fix bugs and support all platforms

2025-08-15 08:33:47 +08:00
parent e82b85f4dd
commit 4945b4c6b0
36 changed files with 2296 additions and 992 deletions


@@ -26,8 +26,9 @@ def get_selenium_driver():
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
chrome_options.add_argument(
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
return driver
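Assembled into a standalone unit, the reformatted driver factory reads roughly as below. This is a sketch: make_headless_chrome is an illustrative name, and the --headless/--no-sandbox flags are assumed, since the hunk starts below the first options.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def make_headless_chrome():
    # Mirrors the options visible in the hunk; a fixed window size
    # keeps layouts rendering consistently in headless mode
    opts = Options()
    opts.add_argument("--headless")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--window-size=1920,1080")
    opts.add_argument(
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    # webdriver-manager downloads a matching chromedriver on first run
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=opts)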
@@ -35,6 +36,7 @@ def get_selenium_driver():
print(f"创建Selenium WebDriver失败: {e}")
return None
def get_page_with_selenium(url, website_name):
"""使用Selenium获取动态加载的页面内容"""
driver = None
@@ -42,17 +44,17 @@ def get_page_with_selenium(url, website_name):
driver = get_selenium_driver()
if not driver:
return None
print(f"使用Selenium加载页面: {url}")
driver.get(url)
# Wait for the page to finish loading
wait_time = 10
if "学习强国" in website_name:
wait_time = 15  # Xuexi Qiangguo (学习强国) needs more time
elif "法治日报" in website_name:
wait_time = 12  # Legal Daily (法治日报) needs a bit longer
# Wait for the page's main content to load
try:
WebDriverWait(driver, wait_time).until(
@@ -60,14 +62,14 @@ def get_page_with_selenium(url, website_name):
)
except:
print(f"等待页面加载超时: {url}")
# 额外等待时间确保动态内容加载完成
time.sleep(3)
# 获取页面源码
page_source = driver.page_source
return page_source
except Exception as e:
print(f"Selenium获取页面失败: {url}, 错误: {e}")
return None
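The wait logic in this hunk hides its condition behind the hunk boundary. A minimal runnable sketch of the same shape, assuming the wait targets presence of the <body> element (fetch_rendered_html is an illustrative name):

import time

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def fetch_rendered_html(driver, url, wait_time=10):
    driver.get(url)
    try:
        # Presence of <body> is an assumed stand-in for the actual
        # condition, which the hunk does not show
        WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located((By.TAG_NAME, "body")))
    except TimeoutException:
        print(f"Timed out waiting for page to load: {url}")
    time.sleep(3)  # grace period for late-arriving JS content, as in the diff
    return driver.page_source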
@@ -78,6 +80,7 @@ def get_page_with_selenium(url, website_name):
except:
pass
def download_media(url, save_dir):
try:
# Add request headers to avoid 403 Forbidden errors
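The hunk cuts off right after the headers comment. A hedged sketch of what such a downloader typically looks like; the exact header set, the Referer, and the URL-derived filename are assumptions, not the commit's code:

import os

import requests

def download_media_sketch(url, save_dir, timeout=15):
    # Browser-like headers avoid 403 Forbidden from servers that
    # reject bare requests; this particular header set is assumed
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/91.0.4472.124 Safari/537.36",
        "Referer": url,
    }
    resp = requests.get(url, headers=headers, timeout=timeout, stream=True)
    resp.raise_for_status()
    os.makedirs(save_dir, exist_ok=True)
    filename = url.rstrip("/").split("/")[-1] or "media.bin"
    path = os.path.join(save_dir, filename)
    with open(path, "wb") as f:
        # Stream to disk in chunks so large media never sits in memory
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
    return path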
@@ -236,7 +239,7 @@ def process_article(url, website):
need_selenium = False
if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
need_selenium = True
try:
if need_selenium:
# Use Selenium to fetch dynamically loaded content
@@ -244,28 +247,28 @@ def process_article(url, website):
if not page_source:
print(f"Selenium获取页面失败{url}")
return
# 检查页面内容是否过短
min_length = 100 if "法治日报" in website.name else 300
if len(page_source) < min_length:
print(f"页面内容过短,可能是重定向页面:{url}")
return
# 创建BeautifulSoup对象
soup = BeautifulSoup(page_source, "html.parser")
else:
# Fetch static content with requests
resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
resp.raise_for_status()
# Check whether this is a redirect page
if len(resp.text) < 300:
print(f"Page content too short, likely a redirect page: {url}")
return
# Build the BeautifulSoup object
soup = BeautifulSoup(resp.text, "html.parser")
except Exception as e:
print(f"请求失败:{url},错误:{e}")
return
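The too-short check appears in both branches; distilled into a hypothetical helper it is just:

def looks_like_redirect(html, website_name):
    # Tiny responses are usually meta-refresh or JS redirect stubs;
    # Legal Daily (法治日报) pages can be legitimately small, hence
    # the lower threshold for that site
    min_length = 100 if "法治日报" in website_name else 300
    return len(html) < min_length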
@@ -353,7 +356,7 @@ def process_article(url, website):
heading_text = heading.get_text(strip=True)
if title_text in heading_text or heading_text in title_text:
heading.decompose()
# Remove elements whose class contains "title"
for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
title_element_text = title_element.get_text(strip=True)
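The containment test runs in both directions (title inside heading, heading inside title) so partial matches are caught either way. A self-contained demo of the same dedupe, with hypothetical markup:

from bs4 import BeautifulSoup

html = ('<div id="body"><h1>Headline</h1>'
        '<div class="art-title">Headline</div>'
        '<p>First paragraph.</p></div>')
soup = BeautifulSoup(html, "html.parser")
content_tag = soup.find("div", id="body")
title_text = "Headline"

# Drop headings that repeat the title, testing containment both ways
for heading in content_tag.find_all(["h1", "h2", "h3"]):
    text = heading.get_text(strip=True)
    if title_text in text or text in title_text:
        heading.decompose()

# Same idea for any element whose class mentions "title"
for el in content_tag.find_all(class_=lambda c: c and "title" in c):
    text = el.get_text(strip=True)
    if text and (title_text in text or text in title_text):
        el.decompose()

print(content_tag.get_text(strip=True))  # -> First paragraph.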
@@ -489,13 +492,13 @@ def process_article(url, website):
soup.find("p", class_="title") or
soup.find("title")
)
# Special handling for Qiushi (求是): if the title is empty or too short, fall back to the <title> tag
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
# Special handling for Qiushi (求是): make sure a title is always extracted
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
@@ -522,7 +525,7 @@ def process_article(url, website):
# If the <strong> sits within the first two paragraphs of the body, delete it
if parent_p in content_tag.find_all("p")[:2]:
strong_tag.decompose()
# Remove duplicate titles from h1/h2/h3 heading elements
for heading in content_tag.find_all(["h1", "h2", "h3"]):
heading_text = heading.get_text(strip=True)
@@ -530,11 +533,12 @@ def process_article(url, website):
# Make sure not to delete title_tag itself
if heading != title_tag:
heading.decompose()
# Remove elements whose class contains "title"
for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
title_element_text = title_element.get_text(strip=True)
- if title_element_text and (title_text in title_element_text or title_element_text in title_text):
+ if title_element_text and (
+     title_text in title_element_text or title_element_text in title_text):
# Make sure not to delete title_tag itself
if title_element != title_tag:
title_element.decompose()
@@ -583,7 +587,7 @@ def process_article(url, website):
soup.find("h2") or # 解放军报使用h2标签作为标题
soup.find("title")
)
# Special handling for PLA Daily (解放军报): if the title is empty or too short, fall back to the <title> tag
if title_tag:
title_text = title_tag.get_text(strip=True)
@@ -606,34 +610,34 @@ def process_article(url, website):
# Remove breadcrumb navigation
for breadcrumb in content_tag.find_all("ol", class_="breadcrumb"):
breadcrumb.decompose()
# Remove share-related elements
for share_element in content_tag.find_all("div", class_="share-custom"):
share_element.decompose()
# Remove author info paragraphs
for author_p in content_tag.find_all("p"):
text = author_p.get_text(strip=True)
if "来源:" in text or "作者:" in text or "责任编辑:" in text or "发布:" in text:
author_p.decompose()
# Remove progress bars
for progress in content_tag.find_all("div", class_="progress-bar"):
progress.decompose()
# Remove video players
for player in content_tag.find_all("div", class_="player"):
player.decompose()
# Remove the media URL container
for media in content_tag.find_all("div", id="mediaurl"):
media.decompose()
# Remove the news list (but keep its contents)
for news_list in content_tag.find_all("ul", id="main-news-list"):
# Don't delete the whole ul; unwrap it so its contents are kept
news_list.unwrap()
# Remove editor info
for editor_element in content_tag.find_all("div", class_="editor"):
editor_element.decompose()
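The news list is unwrapped rather than decomposed because the two calls differ in what they keep. A minimal demo with hypothetical markup:

from bs4 import BeautifulSoup

html = ('<div id="art"><ol class="breadcrumb"><li>Home</li></ol>'
        '<ul id="main-news-list"><li><p>Kept paragraph.</p></li></ul></div>')
soup = BeautifulSoup(html, "html.parser")
art = soup.find("div", id="art")

# decompose() deletes the element together with everything inside it
for crumb in art.find_all("ol", class_="breadcrumb"):
    crumb.decompose()

# unwrap() removes only the wrapper and keeps its children, which is
# why the list tag disappears but the items inside survive
for news_list in art.find_all("ul", id="main-news-list"):
    news_list.unwrap()

print(art)  # -> <div id="art"><li><p>Kept paragraph.</p></li></div>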
@@ -744,13 +748,13 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# Special handling for Workers' Daily (工人日报): if the title is empty or too short, fall back to the <title> tag
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
# Further handling: if the h1 title carries too much unrelated text, try the <title> tag for a cleaner title
if title_tag and title_tag.name == 'h1':
title_text = title_tag.get_text(strip=True)
@@ -877,11 +881,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# Special handling for China Discipline Inspection and Supervision Daily (中国纪检监察报): make sure a title is extracted
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="content") or
soup.find("div", class_="article-content") or
@@ -955,11 +959,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# Special handling for China Youth Daily (中国青年报): make sure a title is extracted
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="main") or # 中国青年报特有内容容器
soup.find("div", class_="content") or
@@ -977,11 +981,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# Special handling for China Women's News (中国妇女报): make sure a title is extracted
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="main") or # 中国妇女报特有内容容器
soup.find("div", class_="news") or # 中国妇女报特有内容容器
@@ -1001,11 +1005,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# Special handling for Legal Daily (法治日报): make sure a title is extracted
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="content-two") or # 优先查找content-two类
soup.find("div", class_="article-content") or # 法治日报特有内容容器
@@ -1058,13 +1062,13 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# Special handling for Farmers' Daily (农民日报): if the title looks garbled, fall back to the <title> tag
if title_tag and title_tag.name == 'h1':
title_text = title_tag.get_text(strip=True)
if title_text and "\ufffd" in title_text:  # U+FFFD marks a wrong-charset decode
title_tag = soup.find("title")
# Special handling for Farmers' Daily (农民日报): make sure a title is extracted
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
@@ -1078,7 +1082,7 @@ def process_article(url, website):
soup.find("div", class_="article") or
soup.find("div", class_="article-body")
)
# Special handling for Farmers' Daily (农民日报): if multiple detailCon divs are found, pick the one with the longest content
if content_tag and content_tag.get('class') and 'detailCon' in content_tag.get('class', []):
detail_cons = soup.find_all("div", class_="detailCon")
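The hunk ends before the selection itself; picking the longest candidate presumably reduces to a max() over text length, as in this sketch (pick_longest is an illustrative name):

from bs4 import BeautifulSoup

def pick_longest(soup, name, class_name):
    # With several candidate containers, keep the text-richest one
    candidates = soup.find_all(name, class_=class_name)
    return max(candidates,
               key=lambda tag: len(tag.get_text(strip=True)),
               default=None)

soup = BeautifulSoup(
    '<div class="detailCon">ad</div>'
    '<div class="detailCon">The actual article body text.</div>',
    "html.parser")
print(pick_longest(soup, "div", "detailCon").get_text())
# -> The actual article body text.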
@@ -1116,17 +1120,17 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# Special handling for Xuexi Qiangguo (学习强国): make sure a title is extracted
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
# Special handling for Xuexi Qiangguo (学习强国): if the title is too short, fall back to the <title> tag
if title_tag:
title_text = title_tag.get_text(strip=True)
if title_text and len(title_text) < 10:
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="content") or
soup.find("div", class_="article-content") or
@@ -1153,17 +1157,17 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# Special handling for Qizhi (旗帜网): if the title is empty or too short, fall back to the <title> tag
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
# Special handling for Qizhi (旗帜网): make sure a title is extracted
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
# Special handling for Qizhi (旗帜网): if the title is too short, fall back to the <title> tag
if title_tag:
title_text = title_tag.get_text(strip=True)
@@ -1232,13 +1236,13 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# Special handling for China Net (中国网): if the title is empty or too short, fall back to the <title> tag
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="article") or # 中国网特有内容容器
soup.find("div", class_="main") or
@@ -1281,7 +1285,7 @@ def process_article(url, website):
# Final title handling - only apply the default when no site-specific handling matched
if not title_tag:
title_tag = soup.find("h1") or soup.find("title")
title = title_tag.get_text(strip=True) if title_tag else "无标题"
# Extra title cleanup: strip any stray whitespace characters
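A sketch of such whitespace cleanup, assuming the goal is collapsing internal runs as well as trimming the ends (clean_title is a hypothetical name):

import re

from bs4 import BeautifulSoup

def clean_title(title_tag, fallback="无标题"):
    # Collapse runs of whitespace (newlines from nested markup,
    # full-width spaces, and so on) and trim the ends
    if title_tag is None:
        return fallback
    text = re.sub(r"\s+", " ", title_tag.get_text()).strip()
    return text or fallback

soup = BeautifulSoup("<h1>  Two\n  line   title </h1>", "html.parser")
print(clean_title(soup.find("h1")))  # -> Two line title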
@@ -1564,7 +1568,7 @@ def full_site_crawler(start_url, website, max_pages=1000):
("/content/" in path) or
(path.startswith("/detail/") and len(path) > 10)
)
# Exclude list pages
if "/index.html" in path or path.endswith("/"):
is_article_page = False
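Pulled out as a standalone predicate, the heuristic reads as below (is_article_url is a hypothetical name; the commit keeps this logic inline):

from urllib.parse import urlparse

def is_article_url(url):
    path = urlparse(url).path
    # Deep content/detail paths are treated as articles...
    is_article = ("/content/" in path or
                  (path.startswith("/detail/") and len(path) > 10))
    # ...while index pages and directory roots are list pages
    if "/index.html" in path or path.endswith("/"):
        is_article = False
    return is_article

print(is_article_url("https://example.com/content/2025-08/15/a1.html"))  # True
print(is_article_url("https://example.com/content/index.html"))          # False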