defget_html(url, params): uapools = [ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14' ]
defget_html(url, params): uapools = [ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14' ]
f = open("content.txt",'r',encoding='utf-8') for line in f: csvRow = line.split() writer.writerow(csvRow)
f.close() csvFile.close()
3.统计一天各个时间段内的评论数 py.py
# coding=gbk
# Count comments per hour of day: read comment timestamps from data.csv,
# tally how many fall in each hour, and save (hour, count) rows to time2.csv.
import csv
from collections import Counter

from pyecharts import options as opts
from sympy.combinatorics import Subset
from wordcloud import WordCloud

# Column 1 of data.csv holds the comment time; its first two characters are
# taken as the hour ("HH") -- TODO confirm against the spider's output format.
with open('../Spiders/data.csv') as csvfile:
    reader = csv.reader(csvfile)
    data1 = [str(row[1])[0:2] for row in reader]
print(data1)
print(type(data1))

# Counter tallies in O(n); the original set + list.count loop was O(n^2).
rst = sorted(Counter(data1).items())
print(type(rst))
print(rst)

with open("time2.csv", "w+", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    for i in rst:
        # Each (hour, count) tuple becomes one CSV row with two columns.
        writer.writerow(i)

# Read the file back in a single pass, splitting it into hour labels (x)
# and counts (y1); the original opened time2.csv twice for the same data.
with open('time2.csv') as csvfile:
    x, y1 = [], []
    for row in csv.reader(csvfile):
        x.append(str(row[0]))
        y1.append(float(row[1]))
print(x)
print(y1)
处理结果(评论时间,评论数)
4.统计最近评论数 py1.py
# coding=gbk
# Count recent comments per day: tally how many comments share each date
# (column 0 of data.csv) and save (date, count) rows to time1.csv.
import csv
from collections import Counter

from pyecharts import options as opts
from sympy.combinatorics import Subset
from wordcloud import WordCloud

# Column 0 of data.csv holds the comment date -- TODO confirm the format.
with open('../Spiders/data.csv') as csvfile:
    reader = csv.reader(csvfile)
    data1 = [str(row[0]) for row in reader]
#print(data1)
print(type(data1))

# Counter tallies in O(n); the original set + list.count loop was O(n^2).
rst = sorted(Counter(data1).items())
print(type(rst))
print(rst)

with open("time1.csv", "w+", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    for i in rst:
        # Each (date, count) tuple becomes one CSV row with two columns.
        writer.writerow(i)

# Single pass over time1.csv builds both the dates (x) and counts (y1);
# the original opened the file twice.
with open('time1.csv') as csvfile:
    x, y1 = [], []
    for row in csv.reader(csvfile):
        x.append(str(row[0]))
        y1.append(float(row[1]))
print(x)
# encoding: utf-8
# Render three pie-chart variants (pie, donut, Nightingale rose) of the
# per-day comment counts in time1.csv to ../Html/pie_pyecharts.html.
# Duplicate `# encoding` lines and duplicate opts/ThemeType imports from
# the original were removed; `Bar` and `randint` are kept although unused
# here, in case other parts of the file rely on them.
import csv
from random import randint

from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.charts import Pie
from pyecharts.globals import ThemeType

# One pass over time1.csv yields dates (x) and counts (y1); the original
# opened the file twice to build the same two lists.
with open('time1.csv') as csvfile:
    x, y1 = [], []
    for row in csv.reader(csvfile):
        x.append(str(row[0]))
        y1.append(float(row[1]))
print(x)
print(y1)

num = y1
lab = x
(
    Pie(init_opts=opts.InitOpts(width='1700px', height='450px',
                                theme=ThemeType.LIGHT))  # default is 900x600
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="扫黑风暴近日评论统计",
            title_textstyle_opts=opts.TextStyleOpts(font_size=27)),
        legend_opts=opts.LegendOpts(
            pos_top="10%",
            pos_left="1%",  # legend placement
        ),
    )
    # plain pie
    .add(series_name='', center=[280, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)])
    # donut
    .add(series_name='', center=[845, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)],
         radius=['40%', '75%'])
    # Nightingale rose
    .add(series_name='', center=[1380, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)],
         rosetype='radius')
).render('../Html/pie_pyecharts.html')
效果图
5.制作每小时评论饼图 pie_pyecharts2.py
# Render three pie-chart variants (pie, donut, Nightingale rose) of the
# per-hour comment counts in time2.csv to ../Html/pie_pyecharts2.html.
import csv
from random import randint

from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.globals import ThemeType

str_name1 = '点'  # hour suffix appended to every label, e.g. "08" -> "08点"

# One pass over time2.csv yields hour labels (x) and counts (y1); the
# original opened the file twice to build the same two lists.
with open('time2.csv') as csvfile:
    x, y1 = [], []
    for row in csv.reader(csvfile):
        x.append(str(row[0] + str_name1))
        y1.append(int(row[1]))
print(x)
print(y1)

num = y1
lab = x
(
    Pie(init_opts=opts.InitOpts(width='1650px', height='500px',
                                theme=ThemeType.LIGHT,))  # default is 900x600
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="扫黑风暴每小时评论统计",
            title_textstyle_opts=opts.TextStyleOpts(font_size=27)),
        legend_opts=opts.LegendOpts(
            pos_top="8%",
            pos_left="4%",  # legend placement
        ),
    )
    # plain pie
    .add(series_name='', center=[250, 300],
         data_pair=[(j, i) for i, j in zip(num, lab)])
    # donut
    .add(series_name='', center=[810, 300],
         data_pair=[(j, i) for i, j in zip(num, lab)],
         radius=['40%', '75%'])
    # Nightingale rose
    .add(series_name='', center=[1350, 300],
         data_pair=[(j, i) for i, j in zip(num, lab)],
         rosetype='radius')
).render('../Html/pie_pyecharts2.html')
效果图
6.制作观看时间区间评论统计饼图 pie_pyecharts3.py
# coding=gbk import csv
from pyecharts import options as opts from pyecharts.globals import ThemeType from sympy.combinatorics import Subset from wordcloud import WordCloud
with open('../Spiders/data.csv') as csvfile: reader = csv.reader(csvfile)
data2 = [int(row[1].strip('')[0:2]) for row in reader]
#print(data2) print(type(data2))
#先变成集合得到seq中的所有元素,避免重复遍历 set_seq = set(data2) list = [] for item in set_seq: list.append((item,data2.count(item))) #添加元素及出现个数 list.sort() print(type(list)) #print(list)
with open("time2.csv", "w+", newline='', encoding='utf-8') as f: writer = csv.writer(f, delimiter=',') for i in list: # 对于每一行的,将这一行的每个元素分别写在对应的列中 writer.writerow(i)
n = 4#分成n组 m = int(len(list)/n) list2 = [] for i in range(0, len(list), m): list2.append(list[i:i+m])
with open('time2.csv') as csvfile: reader = csv.reader(csvfile) y1 = [int(row[1]) for
row in reader]
print(y1)
n =6 groups = [y1[i:i + n] for i in range(0, len(y1), n)]
print(groups)
x=['凌晨','上午','下午','晚上'] y1=[] for y1 in groups: num_sum = 0 for groups in y1: num_sum += groups
print(x) print(y1)
import csv
from pyecharts import options as opts from pyecharts.charts import Pie from random import randint
str_name1 = '点'
num = y1 lab = x ( Pie(init_opts=opts.InitOpts(width='1500px',height='450px',theme=ThemeType.LIGHT))#默认900,600 .set_global_opts( title_opts=opts.TitleOpts(title="扫黑风暴观看时间区间评论统计" , title_textstyle_opts=opts.TextStyleOpts(font_size=30)), legend_opts=opts.LegendOpts(
pos_top="8%", # 图例位置调整 ), ) .add(series_name='',center=[260, 270], data_pair=[(j, i) for i, j in zip(num, lab)])#饼图 .add(series_name='',center=[1230, 270],data_pair=[(j,i) for i,j in zip(num,lab)],radius=['40%','75%'])#环图 .add(series_name='', center=[750, 270],data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')#南丁格尔图 ).render('../Html/pie_pyecharts3.html')
效果图
7.制作扫黑风暴主演提及占比饼图 pie_pyecharts4.py
# Pie charts of how often each lead actor is mentioned in the comments.
import csv
import numpy as np
import re
import jieba
from matplotlib.pyplot import scatter
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from PIL import Image
# Install the packages above yourself; search online if you are unsure how.

# Raw comment text; source for the word statistics below.
f = open('../Spiders/content.txt', 'r', encoding='utf-8')
words = f.read()
f.close()  # a `with` block would be cleaner, kept as-is to match the original

from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint
from pyecharts.globals import ThemeType

# NOTE(review): `count` and `name` are not defined in this excerpt --
# presumably produced by an omitted jieba word-frequency step; verify.
num = count
lab = name
(
    Pie(init_opts=opts.InitOpts(width='1650px', height='450px',
                                theme=ThemeType.LIGHT))  # default is 900x600
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="扫黑风暴主演提及占比",
            title_textstyle_opts=opts.TextStyleOpts(font_size=27)),
        legend_opts=opts.LegendOpts(
            pos_top="3%",
            pos_left="33%",  # legend placement
        ),
    )
    # plain pie
    .add(series_name='', center=[280, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)])
    # donut
    .add(series_name='', center=[800, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)],
         radius=['40%', '75%'])
    # Nightingale rose
    .add(series_name='', center=[1300, 270],
         data_pair=[(j, i) for i, j in zip(num, lab)],
         rosetype='radius')
).render('../Html/pie_pyecharts4.html')
效果图
8.评论内容情感分析 SnowNLP.py
import numpy as np from snownlp import SnowNLP import matplotlib.pyplot as plt
f = open('../Spiders/content.txt', 'r', encoding='UTF-8') list = f.readlines() sentimentslist = [] for i in list: s = SnowNLP(i)