如何解决无法将文章内容附加到列表
使用 python news3k 包,我试图遍历网站上的所有文章,并使用文章内容构建一个数据框。
文章的元数据是一个嵌套的字典,我可以将它从一篇文章中提取出来,但是当我循环遍历它们时就不行了。列表的长度为 0
from rake_nltk import Rake
import readability
import newspaper
from newspaper import Config
from newspaper import Article
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download([
"names","stopwords","state_union","twitter_samples","movie_reviews","averaged_perceptron_tagger","vader_lexicon","punkt",])
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/90.0.4430.212 Safari/537.36'
config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 10
base_url = 'https://www.marketwatch.com'
article_urls = set()
marketwatch = newspaper.build(base_url,config=config,memoize_articles=False,language='en')
title = []
sitename = []
og_type = []
url = []
og_description = []
twitter_identifier = []
twitter_id = []
fb_id = []
author = []
section = []
pub_date = []
tags = []
for sub_article in marketwatch.articles[0:10]:
try:
article = Article(sub_article.url,language='en')
article.download()
article.parse()
if article.url not in article_urls:
article_urls.add(article.url)
# The majority of the article elements are located
# within the Meta data section of the page's
# navigational structure
article_Meta_data = article.Meta_data
published_date = {value for (key,value) in article_Meta_data.items() if key == 'parsely-pub-date'}
article_published_date = " ".join(str(x) for x in published_date)
authors = sorted({value for (key,value) in article_Meta_data.items() if key == 'parsely-author'})
article_author = ','.join(authors)
title = {value for (key,value) in article_Meta_data.items() if key == 'parsely-title'}
article_title = " ".join(str(x) for x in title)
keywords = ''.join({value for (key,value) in article_Meta_data.items() if key == 'keywords'})
keywords_list = sorted(keywords.lower().split(','))
article_keywords = ','.join(keywords_list)
tags = ''.join({value for (key,value) in article_Meta_data.items() if key == 'parsely-tags'})
tag_list = sorted(tags.lower().split(','))
article_tags = ','.join(tag_list)
summary = {value for (key,value) in article_Meta_data.items() if key == 'description'}
article_summary = " ".join(str(x) for x in summary)
# the replace is used to remove newlines
article_text = article.text.replace('\n','')
for key,value in article.Meta_data.items():
print(key,' : ',value)
# Trying to append content to the list
title.append(article.Meta_data['og']['title'])
sitename.append(article.Meta_data['og']['site_name'])
og_type.append(article.Meta_data['og']['type'])
url.append(article.Meta_data['og']['url'])
og_description.append(article.Meta_data['og']['description'])
twitter_identifier.append(article.Meta_data['twitter']['site']['identifier'])
twitter_id.append(article.Meta_data['twitter']['site']['id'])
fb_id.append(article.Meta_data['fb']['app_id'])
author.append(article.Meta_data['author'])
section.append(article.Meta_data['parsely-section'])
pub_date.append(article.Meta_data['parsely-pub-date'])
tags.append(article.Meta_data['parsely-tags'])
print()
except:
pass
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。