import requests
from lxml import html
import pandas as pd
import numpy as np
# Inline style that marks a spacer row which carries no data.
_SPACER_ROW_STYLE = "height:0;font-size:0"

# Substring of the star icon's ``src`` -> numeric importance.
_STAR_RATINGS = (("oneStar", 1), ("twoStar", 2), ("threeStar", 3))


def _parse_importance(cell):
    """Return 1-3 for the star icon found in *cell*, or NaN if there is none."""
    img_src = cell.xpath('./div/img/@src')
    if not img_src:
        # A cell without an icon used to raise IndexError on img_src[0].
        return np.nan
    for marker, rating in _STAR_RATINGS:
        if marker in img_src[0]:
            return rating
    return np.nan


def _first_text(cell):
    """Return the first non-blank text node of *cell*, stripped, or NaN."""
    for fragment in cell.xpath('.//text()'):
        stripped = fragment.strip()
        if stripped:
            return stripped
    return np.nan


def html_to_df_lxml(target_div):
    """Convert a two-table calendar container into a DataFrame.

    Column names are read from the ``<th>`` cells of the table under the
    first child ``<div>``; data rows are read from the ``<tr>`` elements of
    the table under the second child ``<div>``.  Cells are matched to column
    names by position.

    Per-column handling:

    * ``"重要性"`` (importance): the ``src`` of the cell's ``div/img`` is
      mapped to 1, 2 or 3 (``oneStar`` / ``twoStar`` / ``threeStar``);
      anything else, including a missing icon, becomes NaN.
    * ``"指标"`` (indicator): the texts of the cell's ``div/span`` elements
      are joined with single spaces.
    * every other column: the first non-blank text node, stripped, or NaN.

    Rows whose ``style`` attribute is the zero-height spacer style and rows
    that contain no ``<td>`` cells are skipped.  Cells beyond the number of
    headers are ignored.  Missing values in the ``"时间"`` (time) column are
    forward-filled from the previous row.

    Args:
        target_div: Element exposing ``xpath(expr)`` (and whose rows expose
            ``attrib``), e.g. an ``lxml.html.HtmlElement``.

    Returns:
        pandas.DataFrame with one row per data row; empty if no data rows
        were found.
    """
    # Column names.
    headers = [
        label.strip()
        for label in target_div.xpath('.//div[1]/table//th//text()')
        if label.strip()
    ]

    # Data rows.
    table_data = []
    for row in target_div.xpath('.//div[2]/table//tr'):
        # Ignore spacer rows; compare whitespace-insensitively so that
        # "height: 0; font-size: 0;" is recognised as well.
        style = "".join(row.attrib.get('style', '').split()).rstrip(';')
        if style == _SPACER_ROW_STYLE:
            continue

        cells = row.xpath('.//td')
        if not cells:
            # Nothing to extract; an empty record would only add an
            # all-NaN row to the result.
            continue

        record = {}
        # zip() stops at the shorter sequence, so surplus cells cannot
        # index past the end of ``headers``.
        for header, cell in zip(headers, cells):
            if header == "重要性":
                record[header] = _parse_importance(cell)
            elif header == "指标":
                record[header] = ' '.join(cell.xpath('./div/span/text()'))
            else:
                record[header] = _first_text(cell)
        table_data.append(record)

    df = pd.DataFrame(table_data)
    # Forward-fill NaN values in the time column.  Assign the result back
    # instead of using the deprecated fillna(method=..., inplace=True) on a
    # column selection, and tolerate the column being absent.
    if '时间' in df.columns:
        df['时间'] = df['时间'].ffill()
    return df
# Target page.
url = "https://www.fx168news.com/calendar"
# Fetch the page.  A timeout keeps the script from hanging forever on an
# unresponsive server, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the calendar.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Parse the HTML content.
tree = html.fromstring(response.content)
# Locate the container that holds the header table and the data table.
target_div = tree.xpath("/html/body/div[1]/div[1]/div[3]/div[2]/div[4]/div[2]/div[2]/div/div/div/div/div")
# xpath() returns a list of matches; only the first one is used.
if target_div:
    df = html_to_df_lxml(target_div[0])
    print(df)
else:
    # The absolute XPath is tied to the page layout; report a miss instead
    # of exiting silently.
    print("Calendar container not found at the expected XPath; no data extracted.")