0. Preface
The 爱情公寓 (iPartment) movie recently hit theaters, selling itself purely on nostalgia, but Douban reportedly already has it at a 2.7 rating, just a hair above the reigning worst film, 逐梦演艺圈. So today I wrote a crawler, scraped a thousand-odd short reviews off Douban, and took a look at why this movie is such trash.
1. Code
No more preamble, straight to the code, since it's fairly simple anyway. (It depends on lxml, jieba, wordcloud, and matplotlib, all installable with pip.)
from lxml import etree
from urllib import request
import ssl
from jieba import analyse
import jieba
from wordcloud import WordCloud, STOPWORDS
from matplotlib import pyplot as plt
# Extract every short review from one page of HTML
def get_comments(comments):
    print(comments)  # debug output: dump the raw page source
    root = etree.HTML(comments)
    comment_content = ''
    # Each review snippet on the page sits in a node with class "short-content"
    comment_list = root.xpath('//*[@class="short-content"]/text()')
    print('comment size is %d' % len(comment_list))
    for comment in comment_list:
        comment_content += comment
    # Remove leftover empty parentheses
    comment_content = comment_content.replace('()', '')
    print("comment_content is %s" % comment_content)
    return comment_content
# Generate a word cloud from the collected review text
def generate_wordcloud(comments):
    # Words that would dominate the cloud without saying anything useful
    stopwords = set(STOPWORDS)
    stopwords.add('爱情')
    stopwords.add('公寓')
    stopwords.add('电影')
    stopwords.add('评分')
    stopwords.add('...')
    # Segment the Chinese text with jieba and drop the stopwords
    comments = jieba.cut(comments)
    comments = [comment for comment in comments if comment not in stopwords]
    comments = ' '.join(comments)
    # Keep only the top 300 keywords by TF-IDF weight; join them with spaces,
    # since feeding str() of the list to WordCloud would sprinkle brackets
    # and quote marks into the cloud
    comments = ' '.join(analyse.extract_tags(comments, topK=300))
    # pic.jpg serves as the mask, so the cloud takes the shape of the image
    background = plt.imread('pic.jpg')
    wc = WordCloud(font_path='msyh.ttf', random_state=30, width=800, height=800,
                   mask=background, background_color='white',
                   max_font_size=80, max_words=2000)
    wc = wc.generate_from_text(comments)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
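# Optional tweak, my own addition rather than part of the original script:
# WordCloud can also write the image to disk. Adding
#     wc.to_file('wordcloud.png')
# right after generate_from_text above keeps a PNG copy of the cloud
# (the filename here is just an example).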
# Crawl 20 pages of reviews and feed everything to the functions above
url = 'https://movie.douban.com/subject/24852545/reviews?start=%d'
# Skip HTTPS certificate verification so urlopen does not choke on
# machines without a proper CA bundle
ssl._create_default_https_context = ssl._create_unverified_context
comments = ''
for i in range(0, 20):
    offset_url = url % (20 * i)  # Douban shows 20 reviews per page
    print("url is %s" % offset_url)
    with request.urlopen(offset_url) as f:
        content = f.read()
        comment = get_comments(content)
        comments += comment
generate_wordcloud(comments)
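One small optional extension (the helper name crawl_all_comments, the delay parameter, and the one-second default are all my own choices, not part of the original script): pausing between page requests is gentler on Douban and makes the crawler less likely to get blocked. The crawl loop above could then be replaced with:

import time

def crawl_all_comments(base_url, pages=20, delay=1.0):
    # Fetch `pages` pages of reviews, sleeping `delay` seconds between
    # requests so we do not hammer Douban's servers
    all_text = ''
    for i in range(pages):
        with request.urlopen(base_url % (20 * i)) as f:
            all_text += get_comments(f.read())
        time.sleep(delay)
    return all_text

generate_wordcloud(crawl_all_comments(url))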