1.选一个自己感兴趣的主题。
2.网络上爬取相关的数据。
3.进行文本分析,生成词云。
4.对文本分析结果解释说明。
5.写一篇完整的博客,附上源代码、数据爬取及分析结果,形成一个可展示的成果。
1.选题:我对汽车感兴趣。
2.进行数据爬取:抓取易车网广州站首页中所有段落(p 标签)的文本,保存到 rrr.txt,再用 jieba 分词并统计出现次数最多的前 10 个词。
import requests
from bs4 import BeautifulSoup

# Download the page, extract the text of every <p> element, append it to
# rrr.txt, then read the whole file back into `txt` for the analysis steps.
url = "http://guangzhou.bitauto.com/"
# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of silently analysing an error page.
res.raise_for_status()
# Force UTF-8 decoding rather than relying on the encoding requests guesses.
res.encoding = "utf-8"
soup = BeautifulSoup(res.text, "html.parser")

# NOTE(review): the file is opened in append mode, so every run adds another
# copy of the page text and word counts grow across runs. Delete rrr.txt (or
# switch to "w") if each run should start from a clean file -- confirm intent.
with open("rrr.txt", "a+", encoding="utf-8") as output:
    for p in soup.find_all("p"):
        output.write(p.get_text() + "\n")

# Read everything accumulated in the file; `with` guarantees the handle is
# closed, which the original bare open(...).read() did not.
with open("rrr.txt", "r", encoding="utf-8") as source:
    txt = source.read()
from collections import Counter

import jieba

# Segment the scraped text and print the ten most frequent words.
# Expects `txt` (the text to analyse) to be defined earlier in the script.
words = jieba.lcut(txt)
# Plain copy of every token, kept under its original name.
ls = list(words)
# Single-character tokens are skipped and do not take part in the ranking.
counts = Counter(word for word in words if len(word) != 1)
# (word, count) pairs, highest count first; ties keep first-seen order,
# matching the stable sort the manual implementation used.
items = counts.most_common()
# Slicing instead of indexing range(10) avoids an IndexError when the text
# contains fewer than ten distinct multi-character words.
for word, count in items[:10]:
    print("{:<5}{:>2}".format(word, count))
3.生成词云:对文本进行全模式分词,用 WordCloud 生成词云并用 matplotlib 显示。
import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Build and display a word cloud from the scraped text.
# Expects `txt` (the text to analyse) to be defined earlier in the script.

# cut_all=True requests jieba's full mode for the segmentation.
wordlist = jieba.cut(txt, cut_all=True)
# WordCloud receives one string; "/" separates the segmented words.
wl_split = "/".join(wordlist)
# NOTE(review): 'msyh.ttc' is presumably the Microsoft YaHei font shipped with
# Windows; a font containing Chinese glyphs is needed for the words to render.
# Point font_path at an equivalent font on other systems -- confirm.
hzt = WordCloud(font_path='msyh.ttc').generate(wl_split)
plt.imshow(hzt)
# Hide the axes so only the cloud image is shown.
plt.axis("off")
plt.show()
4.结果分析