def calculate_similar(wset, wset_len, title):
    """Score how similar *title* is to a reference set of cut words.

    The score is ``1 - |wset - title_words| / wset_len``: the fewer
    reference words missing from *title*, the closer to 1.0.

    Args:
        wset: set of words cut from the reference title.
        wset_len: divisor for the missing-word count
            (presumably the size of *wset* — confirm at call sites).
        title: candidate sentence to segment and compare.

    Returns:
        float similarity; 1.0 when *wset_len* is zero.
    """
    candidate_words = set(jieba.lcut_for_search(title))
    missing = wset - candidate_words
    try:
        return 1 - len(missing) / wset_len
    except ZeroDivisionError:
        # An empty divisor means nothing can be missing — treat as identical.
        return 1.0
def make_new_title(title, content, rmstop):
    """Rank sentences of *content* as candidate replacements for *title*.

    Sentences whose stripped length is between 0.5x and 2x the original
    title's length are scored by word-overlap similarity with the title,
    cleaned with *rmstop*, and printed in descending similarity order.

    Args:
        title: the original headline.
        content: full article text, split into sentences via module-level ``sep``.
        rmstop: compiled regex whose matches (stop words) are stripped from
            each candidate.

    Returns:
        list of ``(sentence, cleaned_sentence, similarity)`` tuples,
        sorted by similarity, highest first.
    """
    title_len = len(title)
    title_cut = set(jieba.lcut_for_search(title))
    # Candidates must be within half to double the original title's length.
    max_title_len = title_len * 2
    min_title_len = title_len * 0.5
    candi_senes = [
        sen.strip()
        for sen in sep.split(content)
        if min_title_len < len(sen.strip()) < max_title_len
    ]
    good_titles = []
    for candi_sen in candi_senes:
        # Bug fix: pass the word-set size, not the title's character count —
        # calculate_similar divides the missing-word count by this value, so
        # using len(title) skewed every score toward 1.0.
        similar = calculate_similar(title_cut, len(title_cut), candi_sen)
        new_title = rmstop.sub('', candi_sen)
        good_titles.append((candi_sen, new_title, similar))
    good_titles.sort(key=lambda x: x[-1], reverse=True)
    print(good_titles)
    # Bug fix: also return the ranking so callers can use it programmatically
    # (previously the result was only printed and the function returned None).
    return good_titles
def test():
    """Smoke-test make_new_title against local files.

    Reads stop words from ``stopwords.txt`` and the article body from
    ``test.txt``, then runs the title-candidate ranking for a fixed headline.

    NOTE(original author): stripping stop words from candidate sentences was
    tried but hurt readability; the regex is kept for experimentation.
    """
    # Bug fix: close the stop-word file deterministically with a context
    # manager instead of leaking the handle from a bare open().
    with open("stopwords.txt", encoding="utf-8") as f:
        words = [w.strip() for w in f if w.strip()]
    # Bug fix: re.escape each word — stop-word lists routinely contain regex
    # metacharacters (punctuation), which would corrupt the alternation.
    stop_words = re.compile(r'|'.join(re.escape(w) for w in words))
    # test.txt holds the article body as plain text.
    with open('test.txt', encoding='utf-8') as f:
        content = f.read()
    # The original headline to compare candidates against.
    title = "场均净胜43.8分!史上最残暴的球队到底多恐怖?"
    make_new_title(title, content, stop_words)