Fix all bugs

This commit is contained in:
2025-08-15 05:58:16 +08:00
parent c4dfc515f7
commit e82b85f4dd

View File

@@ -7,8 +7,77 @@ from django.utils import timezone
from django.conf import settings from django.conf import settings
from core.models import Article from core.models import Article
import re import re
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
def get_selenium_driver():
    """Create and return a headless Chrome WebDriver, or None on failure."""
    try:
        opts = Options()
        # Run with no visible window, plus flags that keep Chrome stable
        # inside containers / CI, and a desktop UA so sites serve full pages.
        for flag in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        ):
            opts.add_argument(flag)
        # webdriver-manager downloads/caches a chromedriver matching Chrome.
        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=opts,
        )
    except Exception as e:
        print(f"创建Selenium WebDriver失败: {e}")
        return None
def get_page_with_selenium(url, website_name):
    """Fetch a JavaScript-rendered page and return its HTML source.

    Args:
        url: Page URL to load in the headless browser.
        website_name: Site display name; used to pick a per-site wait budget.

    Returns:
        The rendered page source as a string, or None on any failure
        (driver creation failed, navigation error, etc.).
    """
    driver = None
    try:
        driver = get_selenium_driver()
        if not driver:
            return None
        print(f"使用Selenium加载页面: {url}")
        driver.get(url)
        # Per-site wait budget: these sites render content noticeably slower.
        wait_time = 10
        if "学习强国" in website_name:
            wait_time = 15  # xuexi.cn needs extra time for dynamic content
        elif "法治日报" in website_name:
            wait_time = 12  # legaldaily needs a longer render window
        # Wait until at least the <body> element is present.
        try:
            WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        # FIX: was a bare `except:` — narrowed to Exception so
        # KeyboardInterrupt/SystemExit are no longer swallowed here.
        except Exception:
            print(f"等待页面加载超时: {url}")
        # Grace period so late-firing XHR/JS content can settle.
        time.sleep(3)
        return driver.page_source
    except Exception as e:
        print(f"Selenium获取页面失败: {url}, 错误: {e}")
        return None
    finally:
        # Always release the browser process, even if quit itself fails.
        if driver:
            try:
                driver.quit()
            # FIX: was a bare `except:` — narrowed to Exception.
            except Exception:
                pass
def download_media(url, save_dir): def download_media(url, save_dir):
try: try:
# 添加请求头以避免403 Forbidden错误 # 添加请求头以避免403 Forbidden错误
@@ -163,37 +232,68 @@ def process_article(url, website):
"Referer": "http://www.qizhiwang.org.cn/" "Referer": "http://www.qizhiwang.org.cn/"
}) })
try: # 判断是否需要使用Selenium
resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True) need_selenium = False
resp.raise_for_status() if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
need_selenium = True
try:
if need_selenium:
# 使用Selenium获取动态加载的内容
page_source = get_page_with_selenium(url, website.name)
if not page_source:
print(f"Selenium获取页面失败{url}")
return
# 检查页面内容是否过短
min_length = 100 if "法治日报" in website.name else 300
if len(page_source) < min_length:
print(f"页面内容过短,可能是重定向页面:{url}")
return
# 创建BeautifulSoup对象
soup = BeautifulSoup(page_source, "html.parser")
else:
# 使用requests获取静态内容
resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
resp.raise_for_status()
# 检查是否是重定向页面
if len(resp.text) < 300:
print(f"页面内容过短,可能是重定向页面:{url}")
return
# 创建BeautifulSoup对象
soup = BeautifulSoup(resp.text, "html.parser")
# 检查是否是重定向页面
if len(resp.text) < 500:
print(f"页面内容过短,可能是重定向页面:{url}")
return
except Exception as e: except Exception as e:
print(f"请求失败:{url},错误:{e}") print(f"请求失败:{url},错误:{e}")
return return
# 针对不同网站设置正确的编码 # 针对不同网站设置正确的编码仅对requests获取的内容
if "人民网" in website.name or "人民日报" in website.name: if not need_selenium:
resp.encoding = 'utf-8' if "人民网" in website.name or "人民日报" in website.name:
elif "新华网" in website.name: resp.encoding = 'utf-8'
resp.encoding = 'utf-8' elif "新华网" in website.name:
elif "央视" in website.name or "CCTV" in website.name: resp.encoding = 'utf-8'
resp.encoding = 'utf-8' elif "央视" in website.name or "CCTV" in website.name:
elif "农民日报" in website.name or "farmer" in website.name: resp.encoding = 'utf-8'
resp.encoding = 'utf-8' elif "农民日报" in website.name or "farmer" in website.name:
# 尝试其他编码 resp.encoding = 'utf-8'
if '' in resp.text or len(resp.text) < 1000: # 尝试其他编码
resp.encoding = 'gbk' if '' in resp.text or len(resp.text) < 1000:
# 进一步尝试其他编码 resp.encoding = 'gbk'
if '' in resp.text or '' in resp.text: # 进一步尝试其他编码
resp.encoding = 'gb2312' if '' in resp.text or '' in resp.text:
else: resp.encoding = 'gb2312'
resp.encoding = 'utf-8' # 如果还是有问题,尝试更多编码
if '' in resp.text or '' in resp.text:
soup = BeautifulSoup(resp.text, "html.parser") resp.encoding = 'utf-8-sig'
# 最后尝试
if '' in resp.text or '' in resp.text:
resp.encoding = 'big5'
else:
resp.encoding = 'utf-8'
# 处理不同网站的文章结构 # 处理不同网站的文章结构
if website.name == "新华网" or website.name == "www.news.cn" or "新华网" in website.name: if website.name == "新华网" or website.name == "www.news.cn" or "新华网" in website.name:
@@ -777,6 +877,11 @@ def process_article(url, website):
soup.find("h1") or soup.find("h1") or
soup.find("title") soup.find("title")
) )
# 针对中国纪检监察报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = ( content_tag = (
soup.find("div", class_="content") or soup.find("div", class_="content") or
soup.find("div", class_="article-content") or soup.find("div", class_="article-content") or
@@ -784,7 +889,10 @@ def process_article(url, website):
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="main") or
soup.find("div", class_="detail") or
soup.find("div", class_="article_yt") # 中国纪检监察报特有内容容器
) )
elif "中国新闻社" in website.name or "chinanews" in website.name: elif "中国新闻社" in website.name or "chinanews" in website.name:
# 中国新闻社的文章结构处理 - 修复爬取非文章部分问题 # 中国新闻社的文章结构处理 - 修复爬取非文章部分问题
@@ -847,7 +955,13 @@ def process_article(url, website):
soup.find("h1") or soup.find("h1") or
soup.find("title") soup.find("title")
) )
# 针对中国青年报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = ( content_tag = (
soup.find("div", class_="main") or # 中国青年报特有内容容器
soup.find("div", class_="content") or soup.find("div", class_="content") or
soup.find("div", class_="article-content") or soup.find("div", class_="article-content") or
soup.find("div", id="content") or soup.find("div", id="content") or
@@ -863,7 +977,14 @@ def process_article(url, website):
soup.find("h1") or soup.find("h1") or
soup.find("title") soup.find("title")
) )
# 针对中国妇女报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = ( content_tag = (
soup.find("div", class_="main") or # 中国妇女报特有内容容器
soup.find("div", class_="news") or # 中国妇女报特有内容容器
soup.find("div", class_="content") or soup.find("div", class_="content") or
soup.find("div", class_="article-content") or soup.find("div", class_="article-content") or
soup.find("div", id="content") or soup.find("div", id="content") or
@@ -880,6 +1001,11 @@ def process_article(url, website):
soup.find("h1") or soup.find("h1") or
soup.find("title") soup.find("title")
) )
# 针对法治日报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = ( content_tag = (
soup.find("div", class_="content-two") or # 优先查找content-two类 soup.find("div", class_="content-two") or # 优先查找content-two类
soup.find("div", class_="article-content") or # 法治日报特有内容容器 soup.find("div", class_="article-content") or # 法治日报特有内容容器
@@ -888,7 +1014,10 @@ def process_article(url, website):
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="article-detail") or # 法治日报特有内容容器
soup.find("div", class_="detail-content") or # 法治日报特有内容容器
soup.find("div", class_="article-text") # 法治日报特有内容容器
) )
# 针对法治日报的特殊处理,清理内容中的无关元素 # 针对法治日报的特殊处理,清理内容中的无关元素
@@ -935,6 +1064,10 @@ def process_article(url, website):
title_text = title_tag.get_text(strip=True) title_text = title_tag.get_text(strip=True)
if title_text and any(char in title_text for char in ['', '', '']): if title_text and any(char in title_text for char in ['', '', '']):
title_tag = soup.find("title") title_tag = soup.find("title")
# 针对农民日报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = ( content_tag = (
soup.find("div", class_="detailCon") or # 农民日报特有内容容器 soup.find("div", class_="detailCon") or # 农民日报特有内容容器
soup.find("div", class_="article-content") or soup.find("div", class_="article-content") or
@@ -983,6 +1116,17 @@ def process_article(url, website):
soup.find("h1") or soup.find("h1") or
soup.find("title") soup.find("title")
) )
# 针对学习强国的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
# 针对学习强国的特殊处理如果标题太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if title_text and len(title_text) < 10:
title_tag = soup.find("title")
content_tag = ( content_tag = (
soup.find("div", class_="content") or soup.find("div", class_="content") or
soup.find("div", class_="article-content") or soup.find("div", class_="article-content") or
@@ -990,7 +1134,14 @@ def process_article(url, website):
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body") or
soup.find("div", class_="main") or
soup.find("div", class_="detail") or
soup.find("div", class_="lgpage-detail") or # 学习强国特有内容容器
soup.find("div", class_="detail-content") or # 学习强国特有内容容器
soup.find("div", class_="article-detail") or # 学习强国特有内容容器
soup.find("div", class_="xuexi") or # 学习强国特有内容容器
soup.find("div", class_="kNews") # 学习强国特有内容容器
) )
elif "旗帜网" in website.name or "qizhiwang" in website.name: elif "旗帜网" in website.name or "qizhiwang" in website.name:
# 旗帜网的文章结构处理 - 修复不保存正文和图片问题 # 旗帜网的文章结构处理 - 修复不保存正文和图片问题
@@ -1002,6 +1153,22 @@ def process_article(url, website):
soup.find("h1") or soup.find("h1") or
soup.find("title") soup.find("title")
) )
# 针对旗帜网的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
# 针对旗帜网的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
# 针对旗帜网的特殊处理如果标题太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if title_text and len(title_text) < 10:
title_tag = soup.find("title")
content_tag = ( content_tag = (
soup.find("div", class_="w1200 flag-text-con clearfix") or # 旗帜网特有内容容器 soup.find("div", class_="w1200 flag-text-con clearfix") or # 旗帜网特有内容容器
soup.find("div", class_="article-content") or # 旗帜网特有内容容器 soup.find("div", class_="article-content") or # 旗帜网特有内容容器
@@ -1065,14 +1232,23 @@ def process_article(url, website):
soup.find("h1") or soup.find("h1") or
soup.find("title") soup.find("title")
) )
# 针对中国网的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
content_tag = ( content_tag = (
soup.find("div", class_="main") or # 中国网特有内容容器 soup.find("div", class_="article") or # 中国网特有内容容器
soup.find("div", class_="main") or
soup.find("div", class_="textBox") or # 中国网直播特有内容容器
soup.find("div", class_="artInfo") or # 中国网直播特有内容容器
soup.find("div", class_="article-content") or soup.find("div", class_="article-content") or
soup.find("div", class_="content") or soup.find("div", class_="content") or
soup.find("div", id="content") or soup.find("div", id="content") or
soup.find("div", class_="text") or soup.find("div", class_="text") or
soup.find("div", class_="main-content") or soup.find("div", class_="main-content") or
soup.find("div", class_="article") or
soup.find("div", class_="article-body") soup.find("div", class_="article-body")
) )