fix all bugs
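Add Selenium-based page fetching for sites that render content dynamically (学习强国, 法治日报), run the per-site encoding fallbacks only for pages fetched with requests, and extend the title and content-container fallbacks for 中国纪检监察报, 中国青年报, 中国妇女报, 法治日报, 农民日报, 学习强国, 旗帜网, and 中国网.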
core/utils.py (238 changed lines)
@@ -7,8 +7,77 @@ from django.utils import timezone
 from django.conf import settings
 from core.models import Article
 import re
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
+
+
+def get_selenium_driver():
+    """Get a Selenium WebDriver instance."""
+    try:
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")  # headless mode
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument("--window-size=1920,1080")
+        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+
+        service = Service(ChromeDriverManager().install())
+        driver = webdriver.Chrome(service=service, options=chrome_options)
+        return driver
+    except Exception as e:
+        print(f"Failed to create Selenium WebDriver: {e}")
+        return None
+
+
+def get_page_with_selenium(url, website_name):
+    """Fetch dynamically loaded page content with Selenium."""
+    driver = None
+    try:
+        driver = get_selenium_driver()
+        if not driver:
+            return None
+
+        print(f"Loading page with Selenium: {url}")
+        driver.get(url)
+
+        # Wait for the page to finish loading
+        wait_time = 10
+        if "学习强国" in website_name:
+            wait_time = 15  # 学习强国 needs more time
+        elif "法治日报" in website_name:
+            wait_time = 12  # 法治日报 needs a fairly long wait
+
+        # Wait for the page's main content to load
+        try:
+            WebDriverWait(driver, wait_time).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+        except Exception:
+            print(f"Timed out waiting for page load: {url}")
+
+        # Extra wait to make sure dynamic content has finished loading
+        time.sleep(3)
+
+        # Grab the page source
+        page_source = driver.page_source
+        return page_source
+
+    except Exception as e:
+        print(f"Selenium failed to fetch page: {url}, error: {e}")
+        return None
+    finally:
+        if driver:
+            try:
+                driver.quit()
+            except Exception:
+                pass
+
+
 def download_media(url, save_dir):
     try:
         # Add request headers to avoid 403 Forbidden errors
@@ -163,37 +232,68 @@ def process_article(url, website):
         "Referer": "http://www.qizhiwang.org.cn/"
     })

-    try:
-        resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
-        resp.raise_for_status()
-
-        # Check whether this is a redirect page
-        if len(resp.text) < 500:
-            print(f"Page content too short, probably a redirect page: {url}")
-            return
+    # Decide whether Selenium is needed
+    need_selenium = False
+    if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
+        need_selenium = True
+
+    try:
+        if need_selenium:
+            # Use Selenium to fetch dynamically loaded content
+            page_source = get_page_with_selenium(url, website.name)
+            if not page_source:
+                print(f"Selenium failed to fetch page: {url}")
+                return
+
+            # Check whether the page content is too short
+            min_length = 100 if "法治日报" in website.name else 300
+            if len(page_source) < min_length:
+                print(f"Page content too short, probably a redirect page: {url}")
+                return
+
+            # Create the BeautifulSoup object
+            soup = BeautifulSoup(page_source, "html.parser")
+        else:
+            # Use requests to fetch static content
+            resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
+            resp.raise_for_status()
+
+            # Check whether this is a redirect page
+            if len(resp.text) < 300:
+                print(f"Page content too short, probably a redirect page: {url}")
+                return
+
+            # Create the BeautifulSoup object
+            soup = BeautifulSoup(resp.text, "html.parser")
     except Exception as e:
         print(f"Request failed: {url}, error: {e}")
         return

-    # Set the correct encoding per website
-    if "人民网" in website.name or "人民日报" in website.name:
-        resp.encoding = 'utf-8'
-    elif "新华网" in website.name:
-        resp.encoding = 'utf-8'
-    elif "央视" in website.name or "CCTV" in website.name:
-        resp.encoding = 'utf-8'
-    elif "农民日报" in website.name or "farmer" in website.name:
-        resp.encoding = 'utf-8'
-        # Try another encoding
-        if '�' in resp.text or len(resp.text) < 1000:
-            resp.encoding = 'gbk'
-            # Try yet another encoding
-            if '�' in resp.text or '�' in resp.text:
-                resp.encoding = 'gb2312'
-    else:
-        resp.encoding = 'utf-8'
-
-    soup = BeautifulSoup(resp.text, "html.parser")
+    # Set the correct encoding per website (only for content fetched with requests)
+    if not need_selenium:
+        if "人民网" in website.name or "人民日报" in website.name:
+            resp.encoding = 'utf-8'
+        elif "新华网" in website.name:
+            resp.encoding = 'utf-8'
+        elif "央视" in website.name or "CCTV" in website.name:
+            resp.encoding = 'utf-8'
+        elif "农民日报" in website.name or "farmer" in website.name:
+            resp.encoding = 'utf-8'
+            # Try another encoding
+            if '�' in resp.text or len(resp.text) < 1000:
+                resp.encoding = 'gbk'
+                # Try yet another encoding
+                if '�' in resp.text or '�' in resp.text:
+                    resp.encoding = 'gb2312'
+                    # If it still looks wrong, try more encodings
+                    if '�' in resp.text or '�' in resp.text:
+                        resp.encoding = 'utf-8-sig'
+                        # Last attempt
+                        if '�' in resp.text or '�' in resp.text:
+                            resp.encoding = 'big5'
+        else:
+            resp.encoding = 'utf-8'

     # Handle the article structure per website
     if website.name == "新华网" or website.name == "www.news.cn" or "新华网" in website.name:
@@ -777,6 +877,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 中国纪检监察报: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -784,7 +889,10 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="main") or
+            soup.find("div", class_="detail") or
+            soup.find("div", class_="article_yt")  # Content container specific to 中国纪检监察报
         )
     elif "中国新闻社" in website.name or "chinanews" in website.name:
         # Article structure handling for 中国新闻社 - fixes scraping of non-article sections
@@ -847,7 +955,13 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 中国青年报: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
+            soup.find("div", class_="main") or  # Content container specific to 中国青年报
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
@@ -863,7 +977,14 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 中国妇女报: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
+            soup.find("div", class_="main") or  # Content container specific to 中国妇女报
+            soup.find("div", class_="news") or  # Content container specific to 中国妇女报
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
@@ -880,6 +1001,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 法治日报: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
             soup.find("div", class_="content-two") or  # Prefer the content-two class
             soup.find("div", class_="article-content") or  # Content container specific to 法治日报
@@ -888,7 +1014,10 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="article-detail") or  # Content container specific to 法治日报
+            soup.find("div", class_="detail-content") or  # Content container specific to 法治日报
+            soup.find("div", class_="article-text")  # Content container specific to 法治日报
         )

         # Special handling for 法治日报: strip irrelevant elements from the content
@@ -935,6 +1064,10 @@ def process_article(url, website):
             title_text = title_tag.get_text(strip=True)
             if title_text and any(char in title_text for char in ['�', '�', '�']):
                 title_tag = soup.find("title")
+
+        # Special handling for 农民日报: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
         content_tag = (
             soup.find("div", class_="detailCon") or  # Content container specific to 农民日报
             soup.find("div", class_="article-content") or
@@ -983,6 +1116,17 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 学习强国: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
+        # Special handling for 学习强国: if the title is too short, fall back to the title tag
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if title_text and len(title_text) < 10:
+                title_tag = soup.find("title")
+
         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -990,7 +1134,14 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="main") or
+            soup.find("div", class_="detail") or
+            soup.find("div", class_="lgpage-detail") or  # Content container specific to 学习强国
+            soup.find("div", class_="detail-content") or  # Content container specific to 学习强国
+            soup.find("div", class_="article-detail") or  # Content container specific to 学习强国
+            soup.find("div", class_="xuexi") or  # Content container specific to 学习强国
+            soup.find("div", class_="kNews")  # Content container specific to 学习强国
         )
     elif "旗帜网" in website.name or "qizhiwang" in website.name:
         # Article structure handling for 旗帜网 - fixes body text and images not being saved
@@ -1002,6 +1153,22 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 旗帜网: if the title is empty or too short, fall back to the title tag
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if not title_text or len(title_text) < 5:
+                title_tag = soup.find("title")
+
+        # Special handling for 旗帜网: make sure the title is extracted correctly
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
+        # Special handling for 旗帜网: if the title is too short, fall back to the title tag
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if title_text and len(title_text) < 10:
+                title_tag = soup.find("title")
+
         content_tag = (
             soup.find("div", class_="w1200 flag-text-con clearfix") or  # Content container specific to 旗帜网
             soup.find("div", class_="article-content") or  # Content container specific to 旗帜网
@@ -1065,14 +1232,23 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 中国网: if the title is empty or too short, fall back to the title tag
+        if title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if not title_text or len(title_text) < 5:
+                title_tag = soup.find("title")
+
         content_tag = (
-            soup.find("div", class_="main") or  # Content container specific to 中国网
+            soup.find("div", class_="article") or  # Content container specific to 中国网
+            soup.find("div", class_="main") or
+            soup.find("div", class_="textBox") or  # Content container specific to 中国网 live-broadcast pages
+            soup.find("div", class_="artInfo") or  # Content container specific to 中国网 live-broadcast pages
             soup.find("div", class_="article-content") or
             soup.find("div", class_="content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
-            soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
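A minimal usage sketch of the new helper (assuming core.utils exports get_page_with_selenium as added above; the URL is a hypothetical placeholder):

    from bs4 import BeautifulSoup

    from core.utils import get_page_with_selenium

    # Hypothetical article URL; any page from one of the Selenium-gated sites works
    page_source = get_page_with_selenium("https://example.com/article.html", "学习强国")
    if page_source:
        soup = BeautifulSoup(page_source, "html.parser")
        title_tag = soup.find("h1") or soup.find("title")
        print(title_tag.get_text(strip=True) if title_tag else "no title found")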