2. Run the command: python -m pytest . tests --doctest-modules --junitxml=test-results.xml --cov-config=.coveragerc --cov=. --cov-report=html
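The command above reads its coverage settings from a .coveragerc file. A minimal sketch of one; the omit patterns are assumptions, adjust them to the project layout:

[run]
omit =
    tests/*
    */__init__.py

[report]
show_missing = True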
- Parameter timeline: the search time range => possible values are qdr:h (past hour), qdr:d (past day), qdr:w (past week), qdr:m (past month), qdr:y (past year)
- Parameter page: pagination (the start offset of the results)
def google_search(self, query, timeline='', page='0'):
    # tbs carries the time-range filter, start carries the result offset
    url = self.url + query + '&tbs={timeline}&start={page}'.format(timeline=timeline, page=page)
    print('[Check][URL] URL : {url}'.format(url=url))
    response = self.get_source(url)
    return self.parse_googleResults(response)
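A usage sketch, assuming these methods live on a class (called GoogleCrawler here purely for illustration) whose url attribute holds the Google search endpoint:

# Hypothetical class and attribute; adjust to the real class in this project
crawler = GoogleCrawler()  # e.g. self.url = 'https://www.google.com/search?q='
results = crawler.google_search('python', timeline='qdr:w', page='10')  # past week, start=10 (second page)
for item in results:
    print(item['title'], item['link'])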
from requests_html import HTMLSession
import requests

def get_source(self, url):
    try:
        # Open a session and fetch the page
        session = HTMLSession()
        response = session.get(url)
        return response
    except requests.exceptions.RequestException as e:
        # On failure, log the error; the caller receives None
        print(e)
from bs4 import BeautifulSoup

def parse_googleResults(self, response):
    # Class names taken from Google's result markup; they change often, so update as needed
    css_identifier_result = "tF2Cxc"   # one organic search result
    css_identifier_title = "h3"        # title tag
    css_identifier_link = "yuRUbf"     # wrapper div around the result link
    css_identifier_text = "VwiC3b"     # snippet text
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", {"class": css_identifier_result})
    output = []
    for result in results:
        item = {
            'title': result.find(css_identifier_title).get_text(),
            'link': result.find("div", {"class": css_identifier_link}).find(href=True)['href'],
            'text': result.find("div", {"class": css_identifier_text}).get_text()
        }
        output.append(item)
    return output
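Each item in the returned list is a dict with title, link, and text keys. For illustration only, the shape looks like this (the values are made up):

[
    {
        'title': 'Welcome to Python.org',
        'link': 'https://www.python.org/',
        'text': 'The official home of the Python Programming Language ...'
    }
]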
- Use the get_source function to fetch the web resource,
- use the html_parser function to parse the fetched resource,
- and use the html_getText function to pull out the text of the p tags I want (a chained sketch follows the three functions below).
def get_source(self, url):
    try:
        session = HTMLSession()
        response = session.get(url)
        return response
    except requests.exceptions.RequestException as e:
        print(e)
def html_parser(self, htmlText):
    # Parse the raw HTML text into a BeautifulSoup tree
    soup = BeautifulSoup(htmlText, 'html.parser')
    return soup
def html_getText(self, soup):
    original_text = ''
    # Concatenate the text content of every <p> tag
    for el in soup.find_all('p'):
        original_text += ''.join(el.find_all(text=True))
    return original_text
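A minimal sketch chaining the three functions, using the hypothetical crawler instance from the earlier sketch and a placeholder URL:

# Placeholder URL; get_source returns None when the request fails
response = crawler.get_source('https://example.com/article')
if response is not None:
    soup = crawler.html_parser(response.text)
    article_text = crawler.html_getText(soup)
    print(article_text[:200])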
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def word_count(self, text):
    counts = dict()
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    # words = text.replace(',', '').split()
    # Tally every token that is not an English stopword
    for word in words:
        if word not in stop_words:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
    return counts
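A quick check of word_count with the hypothetical crawler instance; the first run needs the NLTK data, and the sample sentence is illustrative:

# One-time download of the tokenizer model and the stopword list
nltk.download('punkt')
nltk.download('stopwords')

counts = crawler.word_count('Google released a new Google Cloud feature')
print(counts)  # e.g. {'Google': 2, 'released': 1, 'new': 1, 'Cloud': 1, 'feature': 1}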
def get_wordcount_json(self, whitelist, dict_data):
    data_array = []
    for i in whitelist:
        json_data = {
            'Date': 'Week1',
            'Company': i,
            # .get avoids a KeyError when a whitelisted company never appears
            'Count': dict_data.get(i, 0)
        }
        data_array.append(json_data)
    return data_array
import pandas as pd

def jsonarray_toexcel(self, data_array):
    # Writing .xlsx via pandas requires the openpyxl package
    df = pd.DataFrame(data=data_array)
    df.to_excel('result.xlsx', index=False)
    return
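Putting the pieces together, an end-to-end sketch; the query, the whitelist, and the use of the first search hit are all illustrative assumptions:

# Hypothetical end-to-end run over the first search result
whitelist = ['Google', 'Microsoft', 'Amazon']

results = crawler.google_search('cloud news', timeline='qdr:w')
response = crawler.get_source(results[0]['link'])
text = crawler.html_getText(crawler.html_parser(response.text))

counts = crawler.word_count(text)
rows = crawler.get_wordcount_json(whitelist, counts)
crawler.jsonarray_toexcel(rows)  # writes result.xlsx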