Complete code. Step 1: scrape the site.
import requests
from bs4 import BeautifulSoup
import time

all_jokes = []
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'}

for i in range(1, 51):
    content = requests.get('http://www.budejie.com/text/{}'.format(i), headers=headers)
    # Strip line breaks from the raw HTML so each joke ends up on a single line
    replaced = content.text.replace('\r\n', '').replace('\r', '').replace('\n', '')
    soup = BeautifulSoup(replaced, 'lxml')
    jokes = soup.find_all('p', attrs={'class': 'j-r-list-c'})
    for joke in jokes:
        print(joke.a.string)
        all_jokes.append(joke.a.string)
    print('Scraped page {}'.format(i))
    time.sleep(2)  # pause between requests to avoid hammering the server

with open('jokes.txt', 'w', encoding='utf-8') as f:
    for j in all_jokes:
        f.write(str(j))
        f.write('\n')
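The crawler rests on one structural assumption: each joke sits in a p tag with class j-r-list-c whose first link holds the text. You can sanity-check that parsing logic offline before crawling; the HTML snippet below is invented for illustration, not copied from budejie.com, so adjust the class name if the live markup differs.

from bs4 import BeautifulSoup

# Invented snippet that mimics the structure the crawler expects
sample_html = '''
<div class="j-r-list">
  <p class="j-r-list-c"><a href="/detail/1">first joke text</a></p>
  <p class="j-r-list-c"><a href="/detail/2">second joke text</a></p>
</div>
'''

# 'html.parser' ships with Python, so this quick check needs no lxml install
soup = BeautifulSoup(sample_html, 'html.parser')
for p in soup.find_all('p', attrs={'class': 'j-r-list-c'}):
    print(p.a.string)  # prints: first joke text / second joke text

If find_all returns an empty list against the real page, the class name has changed and the selector needs updating.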
Step 2: analyze the text.

import jieba

# Read the crawled jokes and segment them with jieba
file = open('jokes.txt', 'r', encoding='utf-8')
lines = file.readlines()
file.close()
text = ''.join(lines)  # join, rather than str(lines), which would inject list brackets and quotes into the tokens
x = ' '.join(jieba.cut(text))
with open('cutjokes.txt', 'w', encoding='utf-8') as f:
    f.write(x)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def print_topics(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = 'topic #{}: '.format(topic_idx)
        # argsort()[:-n_top_words-1:-1] walks the word indices from the
        # highest topic weight downward, keeping the top n_top_words
        message += ' '.join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
        print()

if __name__ == '__main__':
    f = open('cutjokes.txt', 'r', encoding='utf-8')
    tf = TfidfVectorizer(max_features=1000)  # cap the vocabulary at 1000 terms
    x_train = tf.fit_transform(f)  # each line of the file is treated as one document
    lda = LatentDirichletAllocation(n_components=10)  # extract 10 topics
    lda.fit(x_train)
    print(lda)
    n_top_words = 20  # print the 20 strongest words per topic
    # scikit-learn >= 1.0; on older versions use tf.get_feature_names()
    print_topics(lda, tf.get_feature_names_out(), n_top_words)
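To verify the Step 2 pipeline without crawling anything, you can run the same segment, vectorize, LDA chain on a few inline strings. A minimal sketch, assuming only jieba and scikit-learn are installed; the three sample sentences are made up, random_state=0 is added here for reproducibility, and topics fitted on this little text are meaningless beyond confirming the code runs:

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Made-up sample documents standing in for the crawled jokes
docs_raw = ['今天天气真好,适合出去玩', '我喜欢看笑话,笑话让人开心', '这个笑话太好笑了']
docs = [' '.join(jieba.cut(d)) for d in docs_raw]  # space-joined tokens, one string per document

tf = TfidfVectorizer(max_features=1000)
x_train = tf.fit_transform(docs)  # one TF-IDF row per document

lda = LatentDirichletAllocation(n_components=2, random_state=0)
lda.fit(x_train)

# components_[k][i] is the weight of word i in topic k
words = tf.get_feature_names_out()
for k, topic in enumerate(lda.components_):
    top = [words[i] for i in topic.argsort()[:-4:-1]]  # 3 strongest words per topic
    print('topic #{}:'.format(k), ' '.join(top))

Note that TfidfVectorizer's default tokenizer drops single-character tokens, so very short Chinese words produced by jieba will not appear in the vocabulary.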