fix bugs and support all platforms
core/utils.py (106 changes)
@@ -26,8 +26,9 @@ def get_selenium_driver():
         chrome_options.add_argument("--disable-dev-shm-usage")
         chrome_options.add_argument("--disable-gpu")
         chrome_options.add_argument("--window-size=1920,1080")
-        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+        chrome_options.add_argument(
+            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

         service = Service(ChromeDriverManager().install())
         driver = webdriver.Chrome(service=service, options=chrome_options)
         return driver
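For orientation, the options in this hunk belong to a headless-Chrome factory whose remaining lines are outside the diff. A minimal sketch of such a function, assuming the usual headless and no-sandbox flags and the try/except the next hunk implies:

    # Sketch of a driver factory consistent with the hunk above. The headless
    # and no-sandbox flags are assumptions; they do not appear in this diff.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    def make_driver():
        chrome_options = Options()
        chrome_options.add_argument("--headless")    # assumed
        chrome_options.add_argument("--no-sandbox")  # assumed
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
        try:
            service = Service(ChromeDriverManager().install())
            return webdriver.Chrome(service=service, options=chrome_options)
        except Exception as e:
            print(f"Failed to create Selenium WebDriver: {e}")
            return None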
@@ -35,6 +36,7 @@ def get_selenium_driver():
         print(f"Failed to create Selenium WebDriver: {e}")
         return None

+
 def get_page_with_selenium(url, website_name):
     """Fetch dynamically loaded page content with Selenium."""
     driver = None
@@ -42,17 +44,17 @@ def get_page_with_selenium(url, website_name):
         driver = get_selenium_driver()
         if not driver:
             return None

         print(f"Loading page with Selenium: {url}")
         driver.get(url)

         # Wait for the page to finish loading
         wait_time = 10
         if "学习强国" in website_name:
             wait_time = 15  # 学习强国 needs a longer wait
         elif "法治日报" in website_name:
             wait_time = 12  # 法治日报 needs a fairly long wait

         # Wait for the page's main content to load
         try:
             WebDriverWait(driver, wait_time).until(
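The condition passed to until() is cut off by the hunk boundary. A typical condition for this kind of readiness wait, with the locator chosen here purely as an illustration:

    # Hypothetical wait condition; the real locator is outside the hunk above.
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    WebDriverWait(driver, wait_time).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )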
@@ -60,14 +62,14 @@ def get_page_with_selenium(url, website_name):
             )
         except:
             print(f"Timed out waiting for page load: {url}")

         # Extra wait to make sure dynamic content has finished loading
         time.sleep(3)

         # Get the page source
         page_source = driver.page_source
         return page_source

     except Exception as e:
         print(f"Selenium failed to fetch page: {url}, error: {e}")
         return None
@@ -78,6 +80,7 @@ def get_page_with_selenium(url, website_name):
         except:
             pass

+
 def download_media(url, save_dir):
     try:
         # Add request headers to avoid 403 Forbidden errors
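Most of download_media sits outside this hunk. A minimal sketch of a download that sends browser-like headers to dodge 403 responses; the header values and the filename logic are assumptions:

    # Sketch only; everything beyond "send headers to avoid 403" is assumed.
    import os
    import requests

    def download_media_sketch(url, save_dir):
        headers = {"User-Agent": "Mozilla/5.0", "Referer": url}  # assumed values
        try:
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()
            os.makedirs(save_dir, exist_ok=True)
            filename = os.path.basename(url.split("?")[0]) or "media.bin"
            path = os.path.join(save_dir, filename)
            with open(path, "wb") as f:
                f.write(resp.content)
            return path
        except Exception as e:
            print(f"Download failed: {url}, error: {e}")
            return None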
@@ -236,7 +239,7 @@ def process_article(url, website):
     need_selenium = False
     if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
         need_selenium = True

     try:
         if need_selenium:
             # Use Selenium to fetch dynamically loaded content
@@ -244,28 +247,28 @@ def process_article(url, website):
             if not page_source:
                 print(f"Selenium failed to fetch page: {url}")
                 return

             # Check whether the page content is too short
             min_length = 100 if "法治日报" in website.name else 300
             if len(page_source) < min_length:
                 print(f"Page content too short, possibly a redirect page: {url}")
                 return

             # Create a BeautifulSoup object
             soup = BeautifulSoup(page_source, "html.parser")
         else:
             # Use requests to fetch static content
             resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
             resp.raise_for_status()

             # Check whether this is a redirect page
             if len(resp.text) < 300:
                 print(f"Page content too short, possibly a redirect page: {url}")
                 return

             # Create a BeautifulSoup object
             soup = BeautifulSoup(resp.text, "html.parser")

     except Exception as e:
         print(f"Request failed: {url}, error: {e}")
         return
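The Selenium/requests branch above can be read as one fetch helper. A sketch of the same logic in isolation; the helper name is invented, and get_page_with_selenium is the function from earlier in this file:

    # Hypothetical consolidation of the fetch branch, for illustration only.
    import requests

    def fetch_article_html(url, website, headers):
        if any(n in website.name for n in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
            html = get_page_with_selenium(url, website.name)
            min_length = 100 if "法治日报" in website.name else 300
        else:
            resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
            resp.raise_for_status()
            html, min_length = resp.text, 300
        if not html or len(html) < min_length:
            return None  # missing page, or likely a redirect stub
        return html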
@@ -353,7 +356,7 @@ def process_article(url, website):
             heading_text = heading.get_text(strip=True)
             if title_text in heading_text or heading_text in title_text:
                 heading.decompose()

         # Remove elements whose class contains "title"
         for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
             title_element_text = title_element.get_text(strip=True)
@@ -489,13 +492,13 @@ def process_article(url, website):
             soup.find("p", class_="title") or
             soup.find("title")
         )

         # Special handling for 求是: if the title is empty or too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
             if not title_text or len(title_text) < 5:
                 title_tag = soup.find("title")

         # Special handling for 求是: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")
@@ -522,7 +525,7 @@ def process_article(url, website):
                 # If the strong tag sits in the first two paragraphs of the body, remove it
                 if parent_p in content_tag.find_all("p")[:2]:
                     strong_tag.decompose()

         # Remove duplicated titles from h1/h2/h3 heading elements
         for heading in content_tag.find_all(["h1", "h2", "h3"]):
             heading_text = heading.get_text(strip=True)
@@ -530,11 +533,12 @@ def process_article(url, website):
                 # Make sure not to remove title_tag itself
                 if heading != title_tag:
                     heading.decompose()

         # Remove elements whose class contains "title"
         for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
             title_element_text = title_element.get_text(strip=True)
-            if title_element_text and (title_text in title_element_text or title_element_text in title_text):
+            if title_element_text and (
+                    title_text in title_element_text or title_element_text in title_text):
                 # Make sure not to remove title_tag itself
                 if title_element != title_tag:
                     title_element.decompose()
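The containment test above (title text contains element text, or vice versa) is the core of the de-duplication. A self-contained demonstration on invented HTML:

    # Toy demonstration of the title de-duplication pattern; the HTML is made up.
    from bs4 import BeautifulSoup

    html = '<div><h1 class="big-title">Headline</h1><p>Body text</p></div>'
    soup = BeautifulSoup(html, "html.parser")
    title_text = "Headline"
    for el in soup.find_all(class_=lambda x: x and "title" in x):
        text = el.get_text(strip=True)
        if text and (title_text in text or text in title_text):
            el.decompose()  # drop the duplicated headline node
    print(soup.div.get_text(strip=True))  # -> "Body text"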
@@ -583,7 +587,7 @@ def process_article(url, website):
             soup.find("h2") or  # 解放军报 uses an h2 tag for the title
             soup.find("title")
         )

         # Special handling for 解放军报: if the title is empty or too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
@@ -606,34 +610,34 @@ def process_article(url, website):
             # Remove breadcrumb navigation
             for breadcrumb in content_tag.find_all("ol", class_="breadcrumb"):
                 breadcrumb.decompose()

             # Remove share-related elements
             for share_element in content_tag.find_all("div", class_="share-custom"):
                 share_element.decompose()

             # Remove author info paragraphs
             for author_p in content_tag.find_all("p"):
                 text = author_p.get_text(strip=True)
                 if "来源:" in text or "作者:" in text or "责任编辑:" in text or "发布:" in text:
                     author_p.decompose()

             # Remove the progress bar
             for progress in content_tag.find_all("div", class_="progress-bar"):
                 progress.decompose()

             # Remove the player
             for player in content_tag.find_all("div", class_="player"):
                 player.decompose()

             # Remove the media URL container
             for media in content_tag.find_all("div", id="mediaurl"):
                 media.decompose()

             # Remove the news list (but keep its contents)
             for news_list in content_tag.find_all("ul", id="main-news-list"):
                 # Don't delete the whole ul; unwrap it to keep its contents
                 news_list.unwrap()

             # Remove editor info
             for editor_element in content_tag.find_all("div", class_="editor"):
                 editor_element.decompose()
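Two different removals are mixed above: decompose() deletes a node together with everything inside it, while unwrap() deletes only the wrapper and splices its children into the parent, which is why the news list survives as bare items. A toy contrast on invented markup:

    # decompose() versus unwrap(), on made-up markup.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div><ul id="keep"><li>A</li></ul><div class="ad">B</div></div>',
                         "html.parser")
    soup.find("ul", id="keep").unwrap()        # the <li> survives, the <ul> wrapper is gone
    soup.find("div", class_="ad").decompose()  # the whole node, text included, is gone
    print(soup)  # -> <div><li>A</li></div>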
@@ -744,13 +748,13 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 工人日报: if the title is empty or too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
             if not title_text or len(title_text) < 5:
                 title_tag = soup.find("title")

         # Further processing: if the h1 title carries too much noise, try the <title> tag for a cleaner title
         if title_tag and title_tag.name == 'h1':
             title_text = title_tag.get_text(strip=True)
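The empty-or-too-short fallback recurs for almost every site below. A sketch of the shared pattern pulled into one helper; the name and the threshold parameter are assumptions:

    # Hypothetical consolidation of the repeated title-fallback logic.
    def pick_title_tag(soup, candidates, min_len=5):
        """Return the first candidate with usable text, else the <title> tag."""
        for tag in candidates:
            if tag is not None:
                text = tag.get_text(strip=True)
                if text and len(text) >= min_len:
                    return tag
        return soup.find("title")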
@@ -877,11 +881,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 中国纪检监察报: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -955,11 +959,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 中国青年报: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="main") or  # content container specific to 中国青年报
             soup.find("div", class_="content") or
@@ -977,11 +981,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 中国妇女报: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="main") or  # content container specific to 中国妇女报
             soup.find("div", class_="news") or  # content container specific to 中国妇女报
@@ -1001,11 +1005,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 法治日报: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="content-two") or  # prefer the content-two class
             soup.find("div", class_="article-content") or  # content container specific to 法治日报
@@ -1058,13 +1062,13 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 农民日报: if the title is garbled, try the <title> tag
         if title_tag and title_tag.name == 'h1':
             title_text = title_tag.get_text(strip=True)
             if title_text and any(char in title_text for char in ['\ufffd']):
                 title_tag = soup.find("title")

         # Special handling for 农民日报: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")
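A standalone version of the garbled-title check, assuming the signal is U+FFFD, the Unicode replacement character that decoders emit for undecodable bytes:

    # Sketch: flag titles containing the Unicode replacement character.
    def looks_garbled(text: str) -> bool:
        return "\ufffd" in text

    print(looks_garbled("正常标题"))       # False
    print(looks_garbled("坏\ufffd标题"))   # True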
@@ -1078,7 +1082,7 @@ def process_article(url, website):
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )

         # Special handling for 农民日报: if several detailCon divs are found, pick the one with the longest content
         if content_tag and content_tag.get('class') and 'detailCon' in content_tag.get('class', []):
             detail_cons = soup.find_all("div", class_="detailCon")
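The selection step is cut off by the hunk boundary; choosing the longest of several candidate containers is typically a max() over text length. A sketch, with the continuation assumed rather than shown:

    # Hypothetical continuation: pick the detailCon div with the most text.
    detail_cons = soup.find_all("div", class_="detailCon")
    if detail_cons:
        content_tag = max(detail_cons, key=lambda d: len(d.get_text(strip=True)))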
@@ -1116,17 +1120,17 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 学习强国: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         # Special handling for 学习强国: if the title is too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
             if title_text and len(title_text) < 10:
                 title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -1153,17 +1157,17 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 旗帜网: if the title is empty or too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
             if not title_text or len(title_text) < 5:
                 title_tag = soup.find("title")

         # Special handling for 旗帜网: make sure the title is extracted correctly
         if not title_tag or not title_tag.get_text(strip=True):
             title_tag = soup.find("title")

         # Special handling for 旗帜网: if the title is too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
@@ -1232,13 +1236,13 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )

         # Special handling for 中国网: if the title is empty or too short, try the <title> tag
         if title_tag:
             title_text = title_tag.get_text(strip=True)
             if not title_text or len(title_text) < 5:
                 title_tag = soup.find("title")

         content_tag = (
             soup.find("div", class_="article") or  # content container specific to 中国网
             soup.find("div", class_="main") or
@@ -1281,7 +1285,7 @@ def process_article(url, website):
         # Final title handling: fall back to the default only when no site-specific handling applied
         if not title_tag:
             title_tag = soup.find("h1") or soup.find("title")

         title = title_tag.get_text(strip=True) if title_tag else "无标题"

         # Extra processing of the title: strip possible stray whitespace
@@ -1564,7 +1568,7 @@ def full_site_crawler(start_url, website, max_pages=1000):
                 ("/content/" in path) or
                 (path.startswith("/detail/") and len(path) > 10)
             )

             # Exclude list pages
             if "/index.html" in path or path.endswith("/"):
                 is_article_page = False
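The path tests above combine into an article-versus-list heuristic. Pulled out as a standalone predicate (any tests before this hunk are not shown and are omitted here):

    # Sketch of the URL heuristic; only the tests visible in the hunk are included.
    from urllib.parse import urlparse

    def is_article_url(url: str) -> bool:
        path = urlparse(url).path
        is_article_page = (
            ("/content/" in path) or
            (path.startswith("/detail/") and len(path) > 10)
        )
        # Exclude list pages
        if "/index.html" in path or path.endswith("/"):
            is_article_page = False
        return is_article_page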