How to fix Pandas (Python) data not being saved when exporting to Excel
I am just trying to web-scrape a site and export the data to Excel with Pandas. The result only prints the column headers and no rows, even though I keep appending to data (a list). I have done a lot of googling and finally came here for advice. I have updated my previous question, since it may not have met the minimal reproducible example requirement for asking questions.

My plan was:

1. Hard-code the static variables
2. Go to the website
3. Loop through all the prefectures
4. Work out how many posts there are to page through
5. Collect the URL links
6. For-loop into each page, extract the data, and keep appending it to data = []
7. Finally, export and print to Excel

That was my ideal plan... it seems so close, and yet so far...
import os
import sys
import time
import math
import urllib.request
import numpy as np
import pandas as pd
from Stamprally import StamprallyInfo
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
prefectureNameList = ["海外"]
#,"北海道地方","北海道","東北地方","青森県","岩手県","宮城県","秋田県","山形県","福島県","関東地方","茨城県","栃木県","群馬県","埼玉県","千葉県","東京都","神奈川県","中部地方","新潟県","富山県","石川県","福井県","山梨県","長野県","岐阜県","静岡県",# "愛知県","三重県","近畿地方","滋賀県","京都府","大阪府","兵庫県","奈良県","和歌山県","中国地方","鳥取県","島根県","岡山県","広島県","山口県","四国地方","徳島県","香川県","愛媛県","高知県","九州・沖縄地方","福岡県","佐賀県","長崎県","熊本県","大分県","宮崎県","鹿児島県","沖縄県"]
data = []
contentAggregator = []
df = pd.DataFrame(data,columns=["Total List Number","Prefecture","ListLink","Location Tag","Event Tag","Available Period","Available StartDate","End Date","Last Updated","Main Image URL","Title","innerWebSiteURL","mainText"])
main_url = 'https://stamprally.org/'
driver = wd.Chrome(executable_path='chromedriver.exe')
driver.get(main_url)
prefectureValueStorage = driver.find_element_by_xpath(
    "//*[@id='header_search_cat1']/option[2]").get_attribute('value')
# [x.get_attribute('value') for x in driver.find_elements_by_xpath(
#     "//select[@name='search_cat1']/option[@class='level-1' or @class='level-0']")]
prefectureNameIterator = -1
# Loop through all the different prefectures
# for prefectureValue in prefectureValueStorage:
start = time.time()
prefectureNameIterator += 1
driver.get(
    f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={145}&search_cat2=0")
print("START OF PREFECTURE " + prefectureNameList[prefectureNameIterator])
# Calculate How Many Times To Run Page Loop
imageDownloadCounter = 1
totalList = driver.find_element_by_css_selector(
    'div.page_navi2.clearfix>p').text  # .get_attribute('text')
totalListNum = totalList.split("件中")
# Add TotalListNum to the contentAggregator
contentAggregator.append(int(totalListNum[0]))
if int(totalListNum[0]) % 10 != 0:
    pageLoopCount = math.ceil((int(totalListNum[0])/10))
else:
    pageLoopCount = int(totalListNum[0])/10
# continue
currentpage = 0
while currentpage < pageLoopCount:
    currentpage += 1
    print("Current Page " + str(currentpage))
    # ========================================================================================================================================================
    # Loop through all the Listings within the prefecture page
    driver.get(
        f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={145}&search_cat2=0&paged={currentpage}")
    # print("Loading Page %s" % currentpage)
    # ========================================================================================================================================================
    # Add prefectureName to the contentAggregator
    # contentAggregator.append(prefectureNameList[prefectureNameIterator])
    # Gather All List Links
    urlList = []
    currentUrlCounter = 0
    listURLContainer = driver.find_elements_by_css_selector(
        '#post_list2 > li > a')
    # Put all the lists in one Array
    for url in listURLContainer:
        urlList.append(url.get_attribute('href'))
    # Loop through all the links
    for listURL in listURLContainer:
        contentAggregator = []
        # Add TotalListNum to the contentAggregator
        contentAggregator.append(int(totalListNum[0]))
        # Add prefectureName to the contentAggregator
        contentAggregator.append(
            prefectureNameList[prefectureNameIterator])
        print('article Link: ')
        print(urlList[currentUrlCounter])
        # Add listLink to the contentAggregator
        contentAggregator.append(
            urlList[currentUrlCounter])
        # for Each Links in listURLContainer:
        driver.get(urlList[currentUrlCounter])
        currentUrlCounter += 1
        locationTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
            "//*[@id='post_Meta_top']/li[1]/a[@class='cat-category']")]
        print(locationTag)
        # Add locationTag to the contentAggregator
        contentAggregator.append(locationTag)
        eventTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
            "//*[@id='post_Meta_top']/li[2]/a[@class='cat-category2']")]
        contentAggregator.append(eventTag)
        print(eventTag)
        availablePeriod = (driver.find_element_by_css_selector(
            'div#post_date')).text.split("( ")
        availablePeriodFormatted = availablePeriod[0].replace("開催期間:", "")
        availableStartDate = availablePeriod[0].split(" ~ ")
        endDate = availableStartDate[1]
        availableStartDateFormatted = availableStartDate[0].replace(
            "開催期間:", "")
        # Select Latest Update Date
        lastUpdatedDate = driver.find_element_by_css_selector(
            'time.entry-date.updated').text
        print("Available Period:")
        print(availablePeriodFormatted)
        # Add Available Period to the contentAggregator
        contentAggregator.append(availablePeriodFormatted)
        print("Available StartDate:")
        print(availableStartDateFormatted)
        # Add Available StartDate to the contentAggregator
        contentAggregator.append(availableStartDateFormatted)
        print("End Date: ")
        print(endDate)
        # Add endDate to the contentAggregator
        contentAggregator.append(endDate)
        print("Last Updated:")
        print(lastUpdatedDate[6:])
        # Add lastUpdatedDate to the contentAggregator
        contentAggregator.append(lastUpdatedDate[6:])
        # ========================================================================================================================================================
        # Download Main Post Image
        mainImageUrl = driver.find_element_by_css_selector(
            'img.attachment-post-thumbnail.size-post-thumbnail.wp-post-image').get_attribute('src')
        # Add mainImageUrl to the contentAggregator
        contentAggregator.append(mainImageUrl)
        # Save Post Main Title
        postTitle = driver.find_element_by_css_selector(
            'h2#post_title').text.replace(" 開催終了", "")
        print("Title: ")
        print(postTitle)
        # Add Title to the contentAggregator
        contentAggregator.append(postTitle)
        # Save Post Main Image
        urllib.request.urlretrieve(mainImageUrl, (str(
            prefectureNameList[prefectureNameIterator]) + postTitle + str(imageDownloadCounter) + ".png"))
        imageDownloadCounter += 1
        # Get Inner Website Link
        innerWebSiteButtonURL = driver.find_element_by_css_selector(
            'div.post_content.clearfix > div >a').get_attribute('href')
        print("inner Website Button URL: " + innerWebSiteButtonURL)
        # Add innerWebSiteURL to the contentAggregator
        contentAggregator.append(innerWebSiteButtonURL)
        # Gather Main Post Text Content
        mainText = driver.find_elements_by_css_selector(
            'div.post_content.clearfix > p')
        mainContentText = []
        # Remove disclaimer text
        for mainContentDetail in mainText:
            mainContentText.append(mainContentDetail.text)
        mainContextTextCount = len(mainContentText) - 1
        print(mainContentText[:mainContextTextCount])
        # Add Main Post Text Content to the contentAggregator
        contentAggregator.append(mainContentText[:mainContextTextCount])
        # ========================================================================================================================================================
        contentReorder = [1, 10, 5, 6, 7, 8, 12, 3, 4, 9, 11, 2]
        contentAggregator = [contentAggregator[i] for i in contentReorder]
        print("=====================================================================================================================================================")
        print(contentAggregator)
        data.append(contentAggregator)
        print(data)
        print(pd.DataFrame(data, columns=["Total List Number", "Prefecture", "ListLink", "Location Tag", "Event Tag", "Available Period", "Available StartDate", "End Date", "Last Updated", "Main Image URL", "Title", "innerWebSiteURL", "mainText"]))
end = time.time()
print(end - start)
xlwriter = pd.ExcelWriter('StampRally_Crawler.xlsx')
df.to_excel(xlwriter,sheet_name="Stamprally.org Crawl Result")
xlwriter.close()
# ========================================================================================================================================================
# Close Off
driver.close()
driver.quit()
sys.exit()
Solution

You have initialized your df here:
df = pd.DataFrame(data,columns=["Total List Number","Prefecture","ListLink","Location Tag","Event Tag","Available Period","Available StartDate","End Date","Last Updated","Main Image URL","Title","innerWebSiteURL","mainText"])
Inside the loop you are printing a DataFrame built from data, but you never assign it back to df, so the df you later write to Excel is still the empty one created at the top of the script. Try moving the line above to just before you compute the end time, i.e. after the loop has finished appending rows to data.
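A minimal sketch of that fix, reusing the question's own variable names (data, start) and column list; this would replace the tail of the script after the while loop finishes:

# Build the DataFrame AFTER the scraping loop, once `data` actually
# holds the appended rows, instead of from the empty list at the top.
df = pd.DataFrame(data, columns=["Total List Number", "Prefecture", "ListLink",
                                 "Location Tag", "Event Tag", "Available Period",
                                 "Available StartDate", "End Date", "Last Updated",
                                 "Main Image URL", "Title", "innerWebSiteURL", "mainText"])
end = time.time()
print(end - start)
# Write the now-populated DataFrame to Excel; the with-statement
# saves and closes the file automatically.
with pd.ExcelWriter('StampRally_Crawler.xlsx') as xlwriter:
    df.to_excel(xlwriter, sheet_name="Stamprally.org Crawl Result")

You can also keep the original df = pd.DataFrame(...) line where it is, as long as you reassign df from data after the loop; the key point is that the DataFrame you write to Excel must be constructed after data has been filled.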