# Merge the vocabularies of two documents.
def MergeWord(self, T1, T2):
    """Return the union of the words in T1 and T2, preserving first-seen order.

    T1/T2 are iterated for their words (here: Counter objects, iterated as keys).
    A side set gives O(1) membership tests instead of the original O(n)
    list scan per word.
    """
    merged = list(T1)
    seen = set(merged)
    for word in T2:
        if word not in seen:
            seen.add(word)
            merged.append(word)
    return merged
1
2
3
4
5
6
7
8
9
1
2
3
4
5
6
7
8
9
合并方法很简单,不再做解释。接下来添加向量计算方法:
# Build the term-frequency vector of one document over the merged vocabulary.
def CalVector(self, T1, MergeWord):
    """Return a list of term frequencies aligned with MergeWord's order.

    T1 maps word -> frequency (a Counter/dict); words not present in
    MergeWord are ignored. A word->index map replaces the original
    MergeWord.index() call, turning the O(n^2) fill into O(n).
    """
    index = {word: i for i, word in enumerate(MergeWord)}
    vector = [0] * len(MergeWord)
    for word in T1:
        if word in index:
            vector[index[word]] = T1[word]
    return vector
1
2
3
4
5
6
7
8
9
1
2
3
4
5
6
7
8
9
最后添加相似度计算方法:
def cosine_similarity(self, vector1, vector2):
    """Return the cosine similarity of two equal-length numeric vectors,
    scaled to 0-100 and rounded to two decimals.

    Returns 0 when either vector is all zeros (similarity undefined).
    """
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    # zip pairs the two vectors element-wise, e.g. [(1, 4), (2, 5), (3, 6)]
    for a, b in zip(vector1, vector2):
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        return 0
    return round(dot_product / ((normA ** 0.5) * (normB ** 0.5)) * 100, 2)
for k in url_content:print(k,'相似度:',Analyse.get_Tfidf(src,url_content[k]))
1
2
1
2
完整代码如下:
from Browser import Browser
from Analyse import Analyse
def read_txt(path=''):
    """Read and return the entire contents of the text file at *path*.

    Uses a with-statement so the file handle is always closed (the
    original opened the file and never closed it).
    """
    with open(path, 'r') as f:
        return f.read()
src = read_txt(r'F:\tool\textsrc\src.txt')  # load the local reference text
Analyse = Analyse()  # NOTE(review): rebinding shadows the imported class name; kept as in the article
# search configuration
conf = {
    'kw': 'php基础教程 第十一步 面向对象',
    'engine': 'baidu',
}
drvier = Browser(conf)
url_content = drvier.search()  # fetch search result urls and page contents
for k in url_content:
    print(k, '相似度:', Analyse.get_Tfidf(src, url_content[k]))
if real_url in self.conf['white_list']:#白名单continue
1
2
1
2
以上代码对白名单进行了判断,自己设置的白名单不加入到条数。
3.5新建Manage类
新建一个名为 Manage 的 Python 文件,再次封装。代码如下:
from Browser import BrowserManage
from Analyse import Analyse
from FileHandle import FileHandle
class Manage:
    """Drives the whole pipeline: for every local text, search its title and
    score each result page's TF similarity against the local text."""

    def __init__(self, conf):
        self.drvier = BrowserManage(conf)       # search-engine browser driver
        self.textdic = FileHandle().get_text()  # {title: local text content}
        self.analyse = Analyse()                # TF cosine-similarity calculator

    def get_local_analyse(self):
        """Return {local title: {result url: similarity percentage}}."""
        resdic = {}
        for title in self.textdic:
            self.drvier.set_kw(title)
            url_content = self.drvier.search()  # fetch result pages for this title
            resdic[title] = {
                url: self.analyse.get_Tfidf(self.textdic[title], content)
                for url, content in url_content.items()
            }
        return resdic
from jieba import lcut
import jieba.analyse
import collections
from FileHandle import FileHandle
class Analyse:
    """Keyword-frequency cosine similarity between two texts.

    Keywords are extracted with jieba's TextRank; similarity is the cosine
    of the two term-frequency vectors over the merged vocabulary.
    """

    def get_Tfidf(self, text1, text2):
        """Return the similarity (0-100, two decimals) of text1 vs text2."""
        T1 = self.Count(text1)
        T2 = self.Count(text2)
        mergeword = self.MergeWord(T1, T2)
        return self.cosine_similarity(self.CalVector(T1, mergeword),
                                      self.CalVector(T2, mergeword))

    # Tokenize / extract keywords
    def Count(self, text):
        """Return a Counter over the top-20 TextRank keywords of *text*."""
        tag = jieba.analyse.textrank(text, topK=20)
        # NOTE(review): textrank returns unique tags, so every count is 1 — confirm intended
        word_counts = collections.Counter(tag)
        return word_counts

    # Merge vocabularies
    def MergeWord(self, T1, T2):
        """Union of the words in T1 and T2, preserving first-seen order."""
        merged = list(T1)
        seen = set(merged)
        for word in T2:
            if word not in seen:
                seen.add(word)
                merged.append(word)
        return merged

    # Document term-frequency vector
    def CalVector(self, T1, MergeWord):
        """Term-frequency list aligned with MergeWord's order.

        Uses a word->index map instead of list.index() (O(n) vs O(n^2)).
        """
        index = {word: i for i, word in enumerate(MergeWord)}
        vector = [0] * len(MergeWord)
        for word in T1:
            if word in index:
                vector[index[word]] = T1[word]
        return vector

    # Cosine similarity, as a percentage
    def cosine_similarity(self, vector1, vector2):
        """Cosine similarity of two vectors scaled to 0-100, two decimals;
        0 when either vector is all zeros."""
        dot_product = 0.0
        normA = 0.0
        normB = 0.0
        # zip pairs the vectors element-wise, e.g. [(1, 4), (2, 5), (3, 6)]
        for a, b in zip(vector1, vector2):
            dot_product += a * b
            normA += a ** 2
            normB += b ** 2
        if normA == 0.0 or normB == 0.0:
            return 0
        return round(dot_product / ((normA ** 0.5) * (normB ** 0.5)) * 100, 2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
Browser类:
from selenium import webdriver
from bs4 import BeautifulSoup
from SearchEngine import EngineConfManage
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import hashlib
import time
import xlwt
class Browser:
    """Selenium-driven wrapper around a configured search engine."""

    def __init__(self, conf):
        self.browser = webdriver.Chrome()
        self.conf = conf
        self.conf['kw'] = ''
        # per-engine selectors: search box id, button id, next-page xpaths, ...
        self.engine_conf = EngineConfManage().get_Engine_conf(conf['engine']).get_conf()

    # Set the search keyword
    def set_kw(self, kw):
        self.conf['kw'] = kw

    # Type the keyword into the engine's search box
    def send_keyword(self):
        search_input = self.browser.find_element_by_id(self.engine_conf['searchTextID'])
        search_input.send_keys(self.conf['kw'])

    # Click the search button
    def click_search_btn(self):
        search_btn = self.browser.find_element_by_id(self.engine_conf['searchBtnID'])
        search_btn.click()

    # Collect search results and their page text
    def get_search_res_url(self):
        """Open each result in a new tab and collect {real url: page source}
        until target_page entries are gathered, paging as needed."""
        res_link = {}
        WebDriverWait(self.browser, timeout=30, poll_frequency=1).until(
            EC.presence_of_element_located((By.ID, "page")))
        while len(res_link) < self.conf['target_page']:
            # re-parse the current results page (the original parsed only
            # page 1 and kept reusing its links after paging)
            content = self.browser.page_source
            soup = BeautifulSoup(content, "html.parser")
            search_res_list = soup.select('.' + self.engine_conf['searchContentHref_class'])
            for el in search_res_list:
                js = 'window.open("' + el.a['href'] + '")'
                self.browser.execute_script(js)
                handle_this = self.browser.current_window_handle  # current tab handle
                handle_all = self.browser.window_handles          # all tab handles
                handle_exchange = None  # the newly opened tab
                for handle in handle_all:
                    if handle != handle_this:  # any non-current handle is the new tab
                        handle_exchange = handle
                self.browser.switch_to.window(handle_exchange)  # switch to it
                real_url = self.browser.current_url
                if real_url in self.conf['white_list']:  # whitelisted: skip this result
                    # close the tab we just opened (the original leaked it here)
                    self.browser.close()
                    self.browser.switch_to.window(handle_this)
                    continue
                time.sleep(1)
                res_link[real_url] = self.browser.page_source  # record the result
                self.browser.close()
                self.browser.switch_to.window(handle_this)
            # md5 of the page text, used to detect that the next page loaded
            content_md5 = hashlib.md5(self.browser.page_source.encode(encoding='UTF-8')).hexdigest()
            self.click_next_page(content_md5)
        return res_link

    # Go to the next results page
    def click_next_page(self, md5):
        """Click "next page" and poll until the page content changes.

        Returns False when the content did not change within ~30s
        (md5 comparison — known to be fragile, kept from the original).
        """
        WebDriverWait(self.browser, timeout=30, poll_frequency=1).until(
            EC.presence_of_element_located((By.ID, "page")))
        # Baidu uses a different xpath for "next" after the first page
        try:
            next_page_btn = self.browser.find_element_by_xpath(self.engine_conf['nextPageBtnID_xpath_s'])
        except Exception:
            next_page_btn = self.browser.find_element_by_xpath(self.engine_conf['nextPageBtnID_xpath_f'])
        next_page_btn.click()
        i = 0
        while md5 == hashlib.md5(self.browser.page_source.encode(encoding='UTF-8')).hexdigest():
            time.sleep(0.3)  # brief pause to keep polling stable
            i += 1
            if i > 100:
                return False
        return True


class BrowserManage(Browser):
    """Browser subclass exposing the one-shot search entry point."""

    def search(self):
        """Open the engine site, run the keyword search, return results."""
        self.browser.get(self.engine_conf['website'])  # open the search engine
        self.send_keyword()                            # type the keyword
        self.click_search_btn()                        # click search
        return self.get_search_res_url()               # collect result pages
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
Manage类:
from Browser import BrowserManage
from Analyse import Analyse
from FileHandle import FileHandle
class Manage:
    """Drives the whole pipeline: for every local text, search its title and
    score each result page's TF similarity against the local text."""

    def __init__(self, conf):
        self.drvier = BrowserManage(conf)       # search-engine browser driver
        self.textdic = FileHandle().get_text()  # {title: local text content}
        self.analyse = Analyse()                # TF cosine-similarity calculator

    def get_local_analyse(self):
        """Return {local title: {result url: similarity percentage}}."""
        resdic = {}
        for title in self.textdic:
            self.drvier.set_kw(title)
            url_content = self.drvier.search()  # fetch result pages for this title
            resdic[title] = {
                url: self.analyse.get_Tfidf(self.textdic[title], content)
                for url, content in url_content.items()
            }
        return resdic
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
FileHandle类:
import os
class FileHandle:
    """Loads the local .txt corpus from the textsrc directory."""

    # Read one file
    def get_content(self, path):
        """Return the whole contents of the file at *path*.

        Uses a with-statement so the handle is closed even on error
        (the original closed it manually and leaked on exceptions).
        """
        with open(path, "r") as f:
            return f.read()

    # Read every .txt file in the corpus
    def get_text(self):
        """Return {filename without .txt: content} for every non-empty .txt
        file under <this file's directory>\\textsrc.

        NOTE(review): uses backslash joins, so this is Windows-only —
        kept as-is for compatibility with the rest of the article.
        """
        file_path = os.path.dirname(__file__)  # directory of this source file
        txt_path = file_path + r'\textsrc'     # corpus directory
        rootdir = os.path.join(txt_path)
        local_text = {}
        for dirpath, dirnames, filenames in os.walk(rootdir):
            for filename in filenames:
                if os.path.splitext(filename)[1] == '.txt':
                    flag_file_path = dirpath + '\\' + filename
                    flag_file_content = self.get_content(flag_file_path)
                    if flag_file_content != '':  # skip empty files
                        local_text[filename.replace('.txt', '')] = flag_file_content
        return local_text
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
本文最终使用方法如下:
from Manage import Manage

# Result urls on these domains are not counted
white_list = ['blog.csdn.net/A757291228', 'www.cnblogs.com/1-bit', 'blog.csdn.net/csdnnews']

# configuration (the original was missing the comma after 'target_page': 5 — syntax error)
conf = {
    'engine': 'baidu',
    'target_page': 5,
    'white_list': white_list,
}
print(Manage(conf).get_local_analyse())