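"""Crawler utilities for collecting articles from Chinese news sites.

Pages are fetched with requests, or with headless Selenium for sites that render
content with JavaScript (学习强国, 法治日报).  Each article page is parsed with
per-site BeautifulSoup selectors, embedded images and videos are downloaded under
MEDIA_ROOT/articles/<title>/, and the result is stored as a core.models.Article.
"""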
import os
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from django.conf import settings
from django.utils import timezone
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

from core.models import Article

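# Usage sketch (an assumption, not code from this module): these helpers are
# intended to be driven by code that owns the website records, e.g. a Django
# management command.  The `Website` model, its location in core.models and its
# `url` field are assumptions here; only `website.name` is used directly below.
#
#     from core.models import Website
#     for website in Website.objects.all():
#         full_site_crawler(website.url, website, max_pages=500)

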
def get_selenium_driver():
    """Create a headless Selenium WebDriver instance, or return None on failure."""
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # headless mode
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        return driver
    except Exception as e:
        print(f"创建Selenium WebDriver失败: {e}")
        return None


def get_page_with_selenium(url, website_name):
    """Fetch a dynamically rendered page with Selenium and return its HTML source."""
    driver = None
    try:
        driver = get_selenium_driver()
        if not driver:
            return None

        print(f"使用Selenium加载页面: {url}")
        driver.get(url)

        # Sites that render slowly get a longer wait
        wait_time = 10
        if "学习强国" in website_name:
            wait_time = 15
        elif "法治日报" in website_name:
            wait_time = 12

        # Wait for the document body to be present
        try:
            WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        except Exception:
            print(f"等待页面加载超时: {url}")

        # Extra delay so dynamically injected content can finish loading
        time.sleep(3)

        return driver.page_source

    except Exception as e:
        print(f"Selenium获取页面失败: {url}, 错误: {e}")
        return None
    finally:
        if driver:
            try:
                driver.quit()
            except Exception:
                pass


def download_media(url, save_dir):
    """Download an image/video into save_dir and return the local path, or None on failure."""
    try:
        # Send browser-like headers to avoid 403 Forbidden responses
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": urljoin(url, "/")
        }
        resp = requests.get(url, timeout=15, headers=headers)
        resp.raise_for_status()
    except Exception as e:
        print(f"下载失败:{url},错误:{e}")
        return None

    # Derive a safe file name: drop the query string and sanitise special characters
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    if not filename or '.' not in filename:
        # No usable file name in the URL path; fall back to a default
        filename = 'media_file'

    filename = re.sub(r'[^\w\-_\.]', '_', filename)

    # Ensure the file has an extension, inferring it from the Content-Type header
    if '.' not in filename:
        content_type = resp.headers.get('content-type', '')
        if 'image/jpeg' in content_type:
            filename += '.jpg'
        elif 'image/png' in content_type:
            filename += '.png'
        elif 'image/gif' in content_type:
            filename += '.gif'
        elif 'video/mp4' in content_type:
            filename += '.mp4'
        elif 'video/avi' in content_type:
            filename += '.avi'
        elif 'video/quicktime' in content_type:
            filename += '.mov'
        else:
            filename += '.bin'  # default binary extension

    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)

    # Avoid overwriting existing files by appending a counter
    base, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(filepath):
        filename = f"{base}_{counter}{ext}"
        filepath = os.path.join(save_dir, filename)
        counter += 1

    with open(filepath, "wb") as f:
        f.write(resp.content)
    return filepath


def process_article(url, website):
    # Skip articles that have already been saved
    if Article.objects.filter(url=url).exists():
        print(f"文章已存在,跳过: {url}")
        return

    headers = {"User-Agent": "Mozilla/5.0"}

    # Site-specific request headers.  Every matched site gets the same
    # browser-like header set; most also need a site-specific Referer.
    common_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    site_referers = [
        (("人民网", "人民日报"), None),
        (("光明日报", "gmw.cn"), "https://www.gmw.cn/"),
        (("央视", "CCTV", "cctv"), "https://news.cctv.com/"),
        (("中国网", "china.com.cn"), "http://www.china.com.cn/"),
        (("法治日报", "legaldaily"), "http://www.legaldaily.com.cn/"),
        (("工人日报", "workercn"), "https://www.workercn.cn/"),
        (("农民日报", "farmer"), "https://www.farmer.com.cn/"),
        (("解放军报", "81.cn"), "http://www.81.cn/"),
        (("旗帜网", "qizhiwang"), "http://www.qizhiwang.org.cn/"),
    ]
    for keywords, referer in site_referers:
        if any(keyword in website.name for keyword in keywords):
            headers.update(common_headers)
            if referer:
                headers["Referer"] = referer
            break

    # Decide whether this site needs Selenium (content rendered by JavaScript)
    need_selenium = False
    if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
        need_selenium = True

    try:
        if need_selenium:
            # Fetch the dynamically rendered page with Selenium
            page_source = get_page_with_selenium(url, website.name)
            if not page_source:
                print(f"Selenium获取页面失败:{url}")
                return

            # Very short pages are usually redirect stubs
            min_length = 100 if "法治日报" in website.name else 300
            if len(page_source) < min_length:
                print(f"页面内容过短,可能是重定向页面:{url}")
                return

            soup = BeautifulSoup(page_source, "html.parser")
        else:
            # Fetch static content with requests
            resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
            resp.raise_for_status()

            # Set the encoding for known sites before parsing; setting it after the
            # soup has been built would have no effect on the parsed text.
            if "人民网" in website.name or "人民日报" in website.name:
                resp.encoding = 'utf-8'
            elif "新华网" in website.name:
                resp.encoding = 'utf-8'
            elif "央视" in website.name or "CCTV" in website.name:
                resp.encoding = 'utf-8'
            elif "农民日报" in website.name or "farmer" in website.name:
                resp.encoding = 'utf-8'
                # If the text still looks garbled or is suspiciously short, fall back
                # through other encodings (U+FFFD is assumed as the mojibake marker).
                if '�' in resp.text or len(resp.text) < 1000:
                    resp.encoding = 'gbk'
                    if '�' in resp.text:
                        resp.encoding = 'gb2312'
                        if '�' in resp.text:
                            resp.encoding = 'utf-8-sig'
                            if '�' in resp.text:
                                resp.encoding = 'big5'
            else:
                resp.encoding = 'utf-8'

            # Very short pages are usually redirect stubs
            if len(resp.text) < 300:
                print(f"页面内容过短,可能是重定向页面:{url}")
                return

            soup = BeautifulSoup(resp.text, "html.parser")

    except Exception as e:
        print(f"请求失败:{url},错误:{e}")
        return

# 处理不同网站的文章结构
|
||
if website.name == "新华网" or website.name == "www.news.cn" or "新华网" in website.name:
|
||
# 新华网的文章结构处理 - 修复不保存文章内容问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("div", class_="title") or
|
||
soup.find("title")
|
||
)
|
||
content_tag = (
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("span", id="detailContent") # 添加新华网特有的内容容器
|
||
)
|
||
elif website.name == "东方烟草报":
|
||
# 优化东方烟草报的标题提取逻辑,按优先级尝试多种选择器
|
||
title_tag = (
|
||
soup.find("h1", id="title") or # 特别针对带id="title"的h1标签
|
||
soup.find("h1") or # 主要标题标签
|
||
soup.find("title") or # 页面title标签
|
||
soup.find("div", class_="title") or # 某些页面可能使用div.title
|
||
soup.find("h2") # 备选标题标签
|
||
)
|
||
content_tag = soup.find("div", class_="content") # 东方烟草报的内容通常在div.content中
|
||
# 增加对另一种内容结构的支持
|
||
if not content_tag:
|
||
content_tag = soup.find("div", id="gallery")
|
||
# 再增加对新内容结构的支持
|
||
if not content_tag:
|
||
content_tag = soup.find("div", id="ContentText")
|
||
elif website.name == "www.gov.cn":
|
||
# 中国政府网的文章结构处理 - 修复标题重复问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
# 查找主要内容区域,通常在.mainBody或content中
|
||
content_tag = (
|
||
soup.find("div", class_="pages_content") or
|
||
soup.find("div", class_="article_con") or
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="mainBody")
|
||
)
|
||
|
||
# 针对中国政府网的特殊处理,清理内容中的重复标题
|
||
if content_tag and title_tag:
|
||
title_text = title_tag.get_text(strip=True)
|
||
if title_text:
|
||
# 移除内容中的重复标题元素
|
||
for heading in content_tag.find_all(["h1", "h2", "h3"]):
|
||
heading_text = heading.get_text(strip=True)
|
||
if title_text in heading_text or heading_text in title_text:
|
||
heading.decompose()
|
||
|
||
# 移除class包含title的元素
|
||
for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
|
||
title_element_text = title_element.get_text(strip=True)
|
||
if title_text in title_element_text or title_element_text in title_text:
|
||
title_element.decompose()
|
||
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("div", class_="editor"):
|
||
editor_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="share"):
|
||
share_element.decompose()
|
||
|
||
# 移除script标签
|
||
for script_element in content_tag.find_all("script"):
|
||
script_element.decompose()
|
||
|
||
# 移除样式标签
|
||
for style_element in content_tag.find_all("style"):
|
||
style_element.decompose()
|
||
elif "人民日报" in website.name or "人民网" in website.name:
|
||
# 人民日报网站的文章结构处理 - 修复乱码和404问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
# 查找主要内容区域
|
||
content_tag = (
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("section", class_="content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="rm_txt_con") or # 添加人民网特有的内容容器
|
||
soup.find("div", class_="text_c") # 添加新的内容容器
|
||
)
|
||
|
||
# 针对人民网的特殊处理,清理内容中的无关元素
|
||
if content_tag:
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("div", class_="edit"):
|
||
editor_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("p", class_="paper_num"):
|
||
share_element.decompose()
|
||
|
||
# 移除无关的box_pic元素
|
||
for pic_element in content_tag.find_all("div", class_="box_pic"):
|
||
pic_element.decompose()
|
||
|
||
# 移除无关的zdfy元素
|
||
for zdfy_element in content_tag.find_all("div", class_="zdfy"):
|
||
zdfy_element.decompose()
|
||
|
||
# 移除无关的center元素
|
||
for center_element in content_tag.find_all("center"):
|
||
center_element.decompose()
|
||
|
||
# 移除无关的bza元素
|
||
for bza_element in content_tag.find_all("div", class_="bza"):
|
||
bza_element.decompose()
|
||
|
||
# 移除隐藏的无关元素
|
||
for hidden_element in content_tag.find_all(attrs={"style": "display: none;"}):
|
||
hidden_element.decompose()
|
||
|
||
# 移除相关专题
|
||
for related_element in content_tag.find_all("div", id="rwb_tjyd"):
|
||
related_element.decompose()
|
||
|
||
# 移除推荐阅读
|
||
for recommend_element in content_tag.find_all("div", class_="clearfix box_cai"):
|
||
recommend_element.decompose()
|
||
|
||
# 移除相关专题列表
|
||
for topic_element in content_tag.find_all("div", class_="clearfix text_like"):
|
||
topic_element.decompose()
|
||
elif "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
|
||
# 央视网站的文章结构处理 - 修复不保存正文和图片问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1", class_="title_text") or # 央视网特有标题类
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
content_tag = (
|
||
soup.find("div", class_="content_area") or # 央视网特有内容容器
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
|
||
# 针对央视网的特殊处理,清理内容中的无关元素
|
||
if content_tag:
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("div", class_="editor"):
|
||
editor_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="share"):
|
||
share_element.decompose()
|
||
|
||
# 移除相关推荐
|
||
for recommend_element in content_tag.find_all("div", class_="related"):
|
||
recommend_element.decompose()
|
||
|
||
# 移除评论区域
|
||
for comment_element in content_tag.find_all("div", class_="comment"):
|
||
comment_element.decompose()
|
||
|
||
# 移除script标签
|
||
for script_element in content_tag.find_all("script"):
|
||
script_element.decompose()
|
||
|
||
# 移除样式标签
|
||
for style_element in content_tag.find_all("style"):
|
||
style_element.decompose()
|
||
elif "求是" in website.name:
|
||
# 求是网站的文章结构处理 - 修复标题和正文清理问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h2", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("h2") or
|
||
soup.find("p", class_="title") or
|
||
soup.find("title")
|
||
)
|
||
|
||
# 针对求是的特殊处理,如果标题为空或太短,尝试从title标签提取
|
||
if title_tag:
|
||
title_text = title_tag.get_text(strip=True)
|
||
if not title_text or len(title_text) < 5:
|
||
title_tag = soup.find("title")
|
||
|
||
# 针对求是的特殊处理,确保标题被正确提取
|
||
if not title_tag or not title_tag.get_text(strip=True):
|
||
title_tag = soup.find("title")
|
||
|
||
content_tag = (
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article")
|
||
)
|
||
|
||
if content_tag:
|
||
# 移除重复标题(放宽匹配条件,允许部分匹配)
|
||
if title_tag:
|
||
title_text = title_tag.get_text(strip=True)
|
||
if title_text:
|
||
# 移除所有可能的重复标题元素
|
||
for strong_tag in content_tag.find_all("strong"):
|
||
strong_text = strong_tag.get_text(strip=True)
|
||
if strong_text and (title_text in strong_text or strong_text in title_text):
|
||
parent_p = strong_tag.find_parent("p")
|
||
# 如果 strong 在正文前两段内,就删除
|
||
if parent_p in content_tag.find_all("p")[:2]:
|
||
strong_tag.decompose()
|
||
|
||
# 移除h1、h2、h3标题元素中的重复标题
|
||
for heading in content_tag.find_all(["h1", "h2", "h3"]):
|
||
heading_text = heading.get_text(strip=True)
|
||
if heading_text and (title_text in heading_text or heading_text in title_text):
|
||
# 确保不删除title_tag本身
|
||
if heading != title_tag:
|
||
heading.decompose()
|
||
|
||
# 移除class包含title的元素
|
||
for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
|
||
title_element_text = title_element.get_text(strip=True)
|
||
if title_element_text and (
|
||
title_text in title_element_text or title_element_text in title_text):
|
||
# 确保不删除title_tag本身
|
||
if title_element != title_tag:
|
||
title_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="sharebox"):
|
||
share_element.decompose()
|
||
|
||
# 移除二维码相关元素
|
||
for qr_element in content_tag.find_all("div", class_="xl_ewm"):
|
||
qr_element.decompose()
|
||
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("div", class_="fs-text"):
|
||
text = editor_element.get_text(strip=True)
|
||
if text and ("网站编辑" in text or "审核" in text):
|
||
editor_element.decompose()
|
||
|
||
# 移除声明链接
|
||
for declare_element in content_tag.find_all("a", href=lambda x: x and "qssyggw" in x):
|
||
declare_element.decompose()
|
||
|
||
# 移除clearfix等无关div
|
||
for clear_element in content_tag.find_all("div", class_="clear"):
|
||
clear_element.decompose()
|
||
|
||
# 移除分隔线
|
||
for line_element in content_tag.find_all("div", class_="fs-line"):
|
||
line_element.decompose()
|
||
for line_element in content_tag.find_all("div", class_="fs-line_b"):
|
||
line_element.decompose()
|
||
|
||
# unwrap 剪贴板相关元素(保留文字,去掉外层标签)
|
||
for clipboard_element in content_tag.find_all("div", class_="clipboard_text"):
|
||
clipboard_element.unwrap()
|
||
|
||
# unwrap highlight 包装层(保留文字)
|
||
for highlight_element in content_tag.find_all("div", class_="highlight"):
|
||
highlight_element.unwrap()
|
||
elif "解放军报" in website.name or "81.cn" in website.name:
|
||
# 解放军报的文章结构处理 - 修复有视频的文章不被爬取问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1", class_="article-title") or # 解放军报特有标题类
|
||
soup.find("h1") or
|
||
soup.find("h2") or # 解放军报使用h2标签作为标题
|
||
soup.find("title")
|
||
)
|
||
|
||
# 针对解放军报的特殊处理,如果标题为空或太短,尝试从title标签提取
|
||
if title_tag:
|
||
title_text = title_tag.get_text(strip=True)
|
||
if not title_text or len(title_text) < 5:
|
||
title_tag = soup.find("title")
|
||
content_tag = (
|
||
soup.find("div", id="article-content") or # 解放军报实际文章内容容器
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body") or
|
||
soup.find("div", class_="artichle-info") # 作为备选
|
||
)
|
||
|
||
# 针对解放军报的特殊处理,清理内容中的无关元素
|
||
if content_tag:
|
||
# 移除面包屑导航
|
||
for breadcrumb in content_tag.find_all("ol", class_="breadcrumb"):
|
||
breadcrumb.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="share-custom"):
|
||
share_element.decompose()
|
||
|
||
# 移除作者信息段落
|
||
for author_p in content_tag.find_all("p"):
|
||
text = author_p.get_text(strip=True)
|
||
if "来源:" in text or "作者:" in text or "责任编辑:" in text or "发布:" in text:
|
||
author_p.decompose()
|
||
|
||
# 移除进度条
|
||
for progress in content_tag.find_all("div", class_="progress-bar"):
|
||
progress.decompose()
|
||
|
||
# 移除播放器
|
||
for player in content_tag.find_all("div", class_="player"):
|
||
player.decompose()
|
||
|
||
# 移除媒体URL容器
|
||
for media in content_tag.find_all("div", id="mediaurl"):
|
||
media.decompose()
|
||
|
||
# 移除新闻列表(但保留其中的内容)
|
||
for news_list in content_tag.find_all("ul", id="main-news-list"):
|
||
# 不删除整个ul,而是unwrap它,保留其中的内容
|
||
news_list.unwrap()
|
||
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("div", class_="editor"):
|
||
editor_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="share"):
|
||
share_element.decompose()
|
||
|
||
# 移除相关推荐
|
||
for recommend_element in content_tag.find_all("div", class_="related"):
|
||
recommend_element.decompose()
|
||
|
||
# 移除script标签
|
||
for script_element in content_tag.find_all("script"):
|
||
script_element.decompose()
|
||
|
||
# 移除样式标签
|
||
for style_element in content_tag.find_all("style"):
|
||
style_element.decompose()
|
||
elif "光明日报" in website.name or "gmw.cn" in website.name:
|
||
# 光明日报的文章结构处理 - 修复不保存文章内容问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
content_tag = (
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
elif "经济日报" in website.name or "ce.cn" in website.name:
|
||
# 经济日报的文章结构处理
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
content_tag = (
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article")
|
||
)
|
||
elif "中国日报" in website.name or "chinadaily" in website.name:
|
||
# 中国日报的文章结构处理 - 修复不保存文章内容问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
content_tag = (
|
||
soup.find("div", id="Content") or # 中国日报特有内容容器
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
|
||
# 处理中国日报的分页内容
|
||
if content_tag and ("中国日报" in website.name or "chinadaily" in website.name):
|
||
# 查找分页链接
|
||
page_links = []
|
||
current_page_elem = soup.find("div", id="div_currpage")
|
||
if current_page_elem:
|
||
# 查找所有分页链接
|
||
page_links = [a for a in current_page_elem.find_all("a", href=True) if not a.find("img")]
|
||
|
||
# 如果有分页,收集所有页面内容
|
||
if page_links:
|
||
print(f"发现分页内容,共 {len(page_links)} 页需要处理")
|
||
# 收集所有页面的内容
|
||
all_content_html = str(content_tag)
|
||
|
||
# 处理每个分页链接
|
||
for page_link in page_links:
|
||
page_url = urljoin(url, page_link['href'])
|
||
if page_url != url: # 避免重复处理第一页
|
||
try:
|
||
page_resp = requests.get(page_url, headers=headers, timeout=15)
|
||
page_resp.encoding = 'utf-8'
|
||
page_soup = BeautifulSoup(page_resp.text, "html.parser")
|
||
|
||
# 提取分页内容
|
||
page_content = page_soup.find("div", id="Content")
|
||
if page_content:
|
||
all_content_html += str(page_content)
|
||
print(f"已处理分页: {page_url}")
|
||
except Exception as e:
|
||
print(f"处理分页失败 {page_url}: {e}")
|
||
|
||
# 更新content_tag为包含所有分页内容
|
||
content_tag = BeautifulSoup(all_content_html, "html.parser")
|
||
elif "工人日报" in website.name or "workercn" in website.name:
|
||
# 工人日报的文章结构处理 - 修复不保存正文和图片问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1", class_="article-title") or # 工人日报特有标题类
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
|
||
# 针对工人日报的特殊处理,如果标题为空或太短,尝试从title标签提取
|
||
if title_tag:
|
||
title_text = title_tag.get_text(strip=True)
|
||
if not title_text or len(title_text) < 5:
|
||
title_tag = soup.find("title")
|
||
|
||
# 进一步处理:如果h1标题包含太多无关信息,尝试从title标签提取更简洁的标题
|
||
if title_tag and title_tag.name == 'h1':
|
||
title_text = title_tag.get_text(strip=True)
|
||
if title_text and len(title_text) > 50: # 如果h1标题太长
|
||
title_tag = soup.find("title")
|
||
content_tag = (
|
||
soup.find("div", class_="ccontent") or # 工人日报特有内容容器
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
|
||
# 针对工人日报的特殊处理,清理内容中的无关元素
|
||
if content_tag:
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("div", class_="editor"):
|
||
editor_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="share"):
|
||
share_element.decompose()
|
||
|
||
# 移除相关推荐
|
||
for recommend_element in content_tag.find_all("div", class_="related"):
|
||
recommend_element.decompose()
|
||
|
||
# 移除script标签
|
||
for script_element in content_tag.find_all("script"):
|
||
script_element.decompose()
|
||
|
||
# 移除样式标签
|
||
for style_element in content_tag.find_all("style"):
|
||
style_element.decompose()
|
||
elif "科技日报" in website.name or "stdaily" in website.name:
|
||
# 科技日报的文章结构处理 - 修复无法爬取问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
content_tag = (
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
elif "人民政协报" in website.name or "rmzxb" in website.name:
|
||
# 人民政协报的文章结构处理 - 修复爬取错误问题
|
||
title_tag = (
|
||
soup.find("h1", class_="Content_title") or # 添加人民政协网特有标题类
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
        # Special handling for the RMZXB title structure
        if title_tag and title_tag.find("span", id="a"):
            title_tag = title_tag.find("span", id="a")

        content_tag = (
            soup.find("div", class_="content") or
            soup.find("div", class_="article-content") or
            soup.find("div", id="content") or
            soup.find("div", class_="text") or
            soup.find("div", class_="main-content") or
            soup.find("div", class_="article") or
            soup.find("div", class_="article-body") or
            soup.find("div", class_="text_box")  # RMZXB-specific content container
        )

        # If the extracted title is just the breadcrumb "首页>聚焦", fall back to the
        # first bold paragraph of the article body; this needs content_tag, so it
        # runs only after the content container has been located.
        if title_tag and title_tag.get_text(strip=True) == "首页>聚焦" and content_tag:
            first_p = content_tag.find("p")
            if first_p and first_p.find("strong"):
                title_tag = first_p.find("strong")
|
||
|
||
# 针对人民政协网的特殊处理,清理内容中的无关元素
|
||
if content_tag:
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("p", class_="Editor"):
|
||
editor_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="share"):
|
||
share_element.decompose()
|
||
|
||
# 移除Remark元素
|
||
for remark_element in content_tag.find_all("div", class_="Remark"):
|
||
remark_element.decompose()
|
||
|
||
# 移除Paging元素
|
||
for paging_element in content_tag.find_all("div", class_="Paging"):
|
||
paging_element.decompose()
|
||
|
||
# 移除政协号客户端下载提示
|
||
for zxh_element in content_tag.find_all("div",
|
||
style=lambda x: x and "background:#F9F9F9;padding:50px" in x):
|
||
zxh_element.decompose()
|
||
|
||
# 移除版权信息
|
||
for copyright_element in content_tag.find_all("div", class_="copyright"):
|
||
copyright_element.decompose()
|
||
|
||
# 移除script标签
|
||
for script_element in content_tag.find_all("script"):
|
||
script_element.decompose()
|
||
|
||
# 移除样式标签
|
||
for style_element in content_tag.find_all("style"):
|
||
style_element.decompose()
|
||
|
||
elif "中国纪检监察报" in website.name or "jjjcb" in website.name:
|
||
# 中国纪检监察报的文章结构处理 - 修复无法爬取问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
|
||
# 针对中国纪检监察报的特殊处理,确保标题被正确提取
|
||
if not title_tag or not title_tag.get_text(strip=True):
|
||
title_tag = soup.find("title")
|
||
|
||
content_tag = (
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body") or
|
||
soup.find("div", class_="main") or
|
||
soup.find("div", class_="detail") or
|
||
soup.find("div", class_="article_yt") # 中国纪检监察报特有内容容器
|
||
)
|
||
elif "中国新闻社" in website.name or "chinanews" in website.name:
|
||
# 中国新闻社的文章结构处理 - 修复爬取非文章部分问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
# 修改内容选择器,更精确地定位文章正文区域
|
||
content_tag = (
|
||
soup.find("div", class_="left_zw") or # 中国新闻网文章正文区域
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
|
||
elif "学习时报" in website.name or "studytimes" in website.name:
|
||
# 学习时报的文章结构处理 - 修复不保存文章内容问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
content_tag = (
|
||
soup.find("div", id="detail") or # 添加学习时报特有内容容器
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
|
||
# 针对学习时报的特殊处理,清理内容中的无关元素
|
||
if content_tag:
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("div", class_="editor"):
|
||
editor_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="share"):
|
||
share_element.decompose()
|
||
|
||
# 移除无关的TRS_Editor包装层
|
||
for trs_editor in content_tag.find_all("div", class_="TRS_Editor"):
|
||
trs_editor.unwrap() # unwrap只移除标签,保留内容
|
||
|
||
# 移除Custom_UnionStyle包装层
|
||
for custom_style in content_tag.find_all("div", class_="Custom_UnionStyle"):
|
||
custom_style.unwrap() # unwrap只移除标签,保留内容
|
||
elif "中国青年报" in website.name or "cyol" in website.name:
|
||
# 中国青年报的文章结构处理 - 修复无法爬取问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
|
||
# 针对中国青年报的特殊处理,确保标题被正确提取
|
||
if not title_tag or not title_tag.get_text(strip=True):
|
||
title_tag = soup.find("title")
|
||
|
||
content_tag = (
|
||
soup.find("div", class_="main") or # 中国青年报特有内容容器
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
elif "中国妇女报" in website.name or "cnwomen" in website.name:
|
||
# 中国妇女报的文章结构处理 - 修复不保存文章内容问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
|
||
# 针对中国妇女报的特殊处理,确保标题被正确提取
|
||
if not title_tag or not title_tag.get_text(strip=True):
|
||
title_tag = soup.find("title")
|
||
|
||
content_tag = (
|
||
soup.find("div", class_="main") or # 中国妇女报特有内容容器
|
||
soup.find("div", class_="news") or # 中国妇女报特有内容容器
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
elif "法治日报" in website.name or "legaldaily" in website.name:
|
||
# 法治日报的文章结构处理 - 修复不保存正文和图片问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1", class_="article-title") or # 法治日报特有标题类
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
|
||
# 针对法治日报的特殊处理,确保标题被正确提取
|
||
if not title_tag or not title_tag.get_text(strip=True):
|
||
title_tag = soup.find("title")
|
||
|
||
content_tag = (
|
||
soup.find("div", class_="content-two") or # 优先查找content-two类
|
||
soup.find("div", class_="article-content") or # 法治日报特有内容容器
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body") or
|
||
soup.find("div", class_="article-detail") or # 法治日报特有内容容器
|
||
soup.find("div", class_="detail-content") or # 法治日报特有内容容器
|
||
soup.find("div", class_="article-text") # 法治日报特有内容容器
|
||
)
|
||
|
||
# 针对法治日报的特殊处理,清理内容中的无关元素
|
||
if content_tag:
|
||
# 如果找到content-two,需要进一步处理去除内部的标题元素(避免重复)
|
||
if content_tag.get('class') and 'content-two' in content_tag.get('class', []):
|
||
# 查找并移除内容中的标题元素(避免重复)
|
||
inner_titles = content_tag.find_all(['h1', 'h2'])
|
||
title_text = title_tag.get_text(strip=True) if title_tag else "无标题"
|
||
for inner_title in inner_titles:
|
||
if inner_title.get_text().strip() == title_text:
|
||
inner_title.decompose()
|
||
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("div", class_="editor"):
|
||
editor_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="share"):
|
||
share_element.decompose()
|
||
|
||
# 移除相关推荐
|
||
for recommend_element in content_tag.find_all("div", class_="related"):
|
||
recommend_element.decompose()
|
||
|
||
# 移除script标签
|
||
for script_element in content_tag.find_all("script"):
|
||
script_element.decompose()
|
||
|
||
# 移除样式标签
|
||
for style_element in content_tag.find_all("style"):
|
||
style_element.decompose()
|
||
elif "农民日报" in website.name or "farmer" in website.name:
|
||
# 农民日报的文章结构处理 - 修复不保存正文和图片问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1", class_="article-title") or # 农民日报特有标题类
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
|
||
# 针对农民日报的特殊处理,如果标题出现乱码,尝试从title标签提取
|
||
        if title_tag and title_tag.name == 'h1':
            title_text = title_tag.get_text(strip=True)
            # A U+FFFD replacement character is assumed to mark a garbled title
            # (the specific characters originally checked here were unrecoverable)
            if title_text and '�' in title_text:
                title_tag = soup.find("title")
|
||
|
||
# 针对农民日报的特殊处理,确保标题被正确提取
|
||
if not title_tag or not title_tag.get_text(strip=True):
|
||
title_tag = soup.find("title")
|
||
content_tag = (
|
||
soup.find("div", class_="detailCon") or # 农民日报特有内容容器
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
|
||
# 针对农民日报的特殊处理,如果找到多个detailCon,选择内容最长的那个
|
||
if content_tag and content_tag.get('class') and 'detailCon' in content_tag.get('class', []):
|
||
detail_cons = soup.find_all("div", class_="detailCon")
|
||
if len(detail_cons) > 1:
|
||
# 选择内容最长的detailCon
|
||
longest_content = max(detail_cons, key=lambda x: len(x.get_text(strip=True)))
|
||
if len(longest_content.get_text(strip=True)) > len(content_tag.get_text(strip=True)):
|
||
content_tag = longest_content
|
||
|
||
# 针对农民日报的特殊处理,清理内容中的无关元素
|
||
if content_tag:
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("div", class_="editor"):
|
||
editor_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="share"):
|
||
share_element.decompose()
|
||
|
||
# 移除相关推荐
|
||
for recommend_element in content_tag.find_all("div", class_="related"):
|
||
recommend_element.decompose()
|
||
|
||
# 移除script标签
|
||
for script_element in content_tag.find_all("script"):
|
||
script_element.decompose()
|
||
|
||
# 移除样式标签
|
||
for style_element in content_tag.find_all("style"):
|
||
style_element.decompose()
|
||
elif "学习强国" in website.name or "xuexi" in website.name:
|
||
# 学习强国的文章结构处理 - 修复无法爬取问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
|
||
# 针对学习强国的特殊处理,确保标题被正确提取
|
||
if not title_tag or not title_tag.get_text(strip=True):
|
||
title_tag = soup.find("title")
|
||
|
||
# 针对学习强国的特殊处理,如果标题太短,尝试从title标签提取
|
||
if title_tag:
|
||
title_text = title_tag.get_text(strip=True)
|
||
if title_text and len(title_text) < 10:
|
||
title_tag = soup.find("title")
|
||
|
||
content_tag = (
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body") or
|
||
soup.find("div", class_="main") or
|
||
soup.find("div", class_="detail") or
|
||
soup.find("div", class_="lgpage-detail") or # 学习强国特有内容容器
|
||
soup.find("div", class_="detail-content") or # 学习强国特有内容容器
|
||
soup.find("div", class_="article-detail") or # 学习强国特有内容容器
|
||
soup.find("div", class_="xuexi") or # 学习强国特有内容容器
|
||
soup.find("div", class_="kNews") # 学习强国特有内容容器
|
||
)
|
||
elif "旗帜网" in website.name or "qizhiwang" in website.name:
|
||
# 旗帜网的文章结构处理 - 修复不保存正文和图片问题
|
||
title_tag = (
|
||
soup.find("div", class_="w1200 flag-text-tit clearfix") and
|
||
soup.find("div", class_="w1200 flag-text-tit clearfix").find("h1") or
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1", class_="article-title") or # 旗帜网特有标题类
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
|
||
# 针对旗帜网的特殊处理,如果标题为空或太短,尝试从title标签提取
|
||
if title_tag:
|
||
title_text = title_tag.get_text(strip=True)
|
||
if not title_text or len(title_text) < 5:
|
||
title_tag = soup.find("title")
|
||
|
||
# 针对旗帜网的特殊处理,确保标题被正确提取
|
||
if not title_tag or not title_tag.get_text(strip=True):
|
||
title_tag = soup.find("title")
|
||
|
||
# 针对旗帜网的特殊处理,如果标题太短,尝试从title标签提取
|
||
if title_tag:
|
||
title_text = title_tag.get_text(strip=True)
|
||
if title_text and len(title_text) < 10:
|
||
title_tag = soup.find("title")
|
||
content_tag = (
|
||
soup.find("div", class_="w1200 flag-text-con clearfix") or # 旗帜网特有内容容器
|
||
soup.find("div", class_="article-content") or # 旗帜网特有内容容器
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
|
||
# 针对旗帜网的特殊处理,清理内容中的无关元素
|
||
if content_tag:
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("div", class_="editor"):
|
||
editor_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="share"):
|
||
share_element.decompose()
|
||
|
||
# 移除相关推荐
|
||
for recommend_element in content_tag.find_all("div", class_="related"):
|
||
recommend_element.decompose()
|
||
|
||
# 移除script标签
|
||
for script_element in content_tag.find_all("script"):
|
||
script_element.decompose()
|
||
|
||
# 移除样式标签
|
||
for style_element in content_tag.find_all("style"):
|
||
style_element.decompose()
|
||
|
||
|
||
|
||
elif "中国网" in website.name or "china.com.cn" in website.name:
|
||
# 中国网的文章结构处理 - 修复不保存正文和图片问题
|
||
title_tag = (
|
||
soup.find("h1", class_="title") or
|
||
soup.find("h1", class_="article-title") or # 中国网特有标题类
|
||
soup.find("h1") or
|
||
soup.find("title")
|
||
)
|
||
|
||
# 针对中国网的特殊处理,如果标题为空或太短,尝试从title标签提取
|
||
if title_tag:
|
||
title_text = title_tag.get_text(strip=True)
|
||
if not title_text or len(title_text) < 5:
|
||
title_tag = soup.find("title")
|
||
|
||
content_tag = (
|
||
soup.find("div", class_="article") or # 中国网特有内容容器
|
||
soup.find("div", class_="main") or
|
||
soup.find("div", class_="textBox") or # 中国网直播特有内容容器
|
||
soup.find("div", class_="artInfo") or # 中国网直播特有内容容器
|
||
soup.find("div", class_="article-content") or
|
||
soup.find("div", class_="content") or
|
||
soup.find("div", id="content") or
|
||
soup.find("div", class_="text") or
|
||
soup.find("div", class_="main-content") or
|
||
soup.find("div", class_="article-body")
|
||
)
|
||
|
||
# 针对中国网的特殊处理,清理内容中的无关元素
|
||
if content_tag:
|
||
# 移除编辑信息
|
||
for editor_element in content_tag.find_all("div", class_="editor"):
|
||
editor_element.decompose()
|
||
|
||
# 移除分享相关元素
|
||
for share_element in content_tag.find_all("div", class_="share"):
|
||
share_element.decompose()
|
||
|
||
# 移除相关推荐
|
||
for recommend_element in content_tag.find_all("div", class_="related"):
|
||
recommend_element.decompose()
|
||
|
||
# 移除script标签
|
||
for script_element in content_tag.find_all("script"):
|
||
script_element.decompose()
|
||
|
||
# 移除样式标签
|
||
for style_element in content_tag.find_all("style"):
|
||
style_element.decompose()
|
||
    else:
        # Default handling for sites without a dedicated branch
        title_tag = soup.find("h1") or soup.find("title")
        content_tag = soup.find("div", class_="content") or soup.find("div", id="content")

    # Final title fallback, used only when no site-specific branch produced one
    if not title_tag:
        title_tag = soup.find("h1") or soup.find("title")

    title = title_tag.get_text(strip=True) if title_tag else "无标题"
    # Strip any remaining surrounding whitespace from the title
    title = title.strip() if title else "无标题"

    if not content_tag:
        print("没有找到正文,跳过:", url)
        return

    imgs = content_tag.find_all("img")
    videos = content_tag.find_all("video")
    media_files = []

    # Build a per-article media directory from a sanitised title
    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
    save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title)
    os.makedirs(save_dir, exist_ok=True)

    # Download images and rewrite their src attributes to local media URLs
    for img in imgs:
        src = img.get("src")
        if not src:
            continue
        if not src.startswith("http"):
            src = urljoin(url, src)
        local_path = download_media(src, save_dir)
        if local_path:
            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
            media_files.append(rel_path.replace("\\", "/"))

# 处理视频文件
|
||
for video in videos:
|
||
src = video.get("src")
|
||
if not src:
|
||
# 检查<source>标签
|
||
source = video.find("source")
|
||
if source:
|
||
src = source.get("src")
|
||
|
||
# 检查data-src属性(央视网等网站常用)
|
||
if not src:
|
||
src = video.get("data-src")
|
||
|
||
# 检查其他可能的视频源属性
|
||
if not src:
|
||
src = video.get("data-url") or video.get("data-video")
|
||
|
||
# 新增:检查新华网特有的视频源属性
|
||
if not src:
|
||
src = video.get("data-video-src")
|
||
|
||
# 新增:针对新华网的特殊处理,从复杂播放器结构中提取视频源
|
||
if not src and "新华网" in website.name:
|
||
# 尝试从video标签的属性中直接获取src
|
||
for attr in video.attrs:
|
||
if 'src' in attr.lower():
|
||
src = video.attrs.get(attr)
|
||
break
|
||
|
||
# 如果还是没有找到,尝试查找父容器中的视频源信息
|
||
if not src:
|
||
parent = video.parent
|
||
if parent and parent.name == 'div' and 'player-container' in parent.get('class', []):
|
||
# 检查是否有data-*属性包含视频信息
|
||
for attr, value in parent.attrs.items():
|
||
if 'data' in attr and isinstance(value, str) and ('.mp4' in value or 'video' in value):
|
||
src = value
|
||
break
|
||
|
||
if not src:
|
||
continue
|
||
|
||
if not src.startswith("http"):
|
||
src = urljoin(url, src)
|
||
|
||
# 针对央视网等特殊处理
|
||
if "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
|
||
# 央视网视频可能需要特殊处理
|
||
if "cctv.com" in src or "cntv.cn" in src:
|
||
print(f"发现央视视频: {src}")
|
||
|
||
# 针对新华网的特殊处理
|
||
elif "新华网" in website.name:
|
||
print(f"发现新华网视频: {src}")
|
||
|
||
# 针对解放军报的特殊处理
|
||
elif "解放军报" in website.name or "81.cn" in website.name:
|
||
print(f"发现解放军报视频: {src}")
|
||
# 解放军报视频可能需要特殊处理
|
||
if "81.cn" in src:
|
||
print(f"处理解放军报视频: {src}")
|
||
|
||
local_path = download_media(src, save_dir)
|
||
if local_path:
|
||
rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
|
||
# 更新视频src属性
|
||
if video.get("src"):
|
||
video["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
|
||
else:
|
||
source = video.find("source")
|
||
if source:
|
||
source["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
|
||
media_files.append(rel_path.replace("\\", "/"))
|
||
print(f"视频下载成功: {src}")
|
||
else:
|
||
print(f"视频下载失败: {src}")
|
||
|
||
content_html = str(content_tag)
|
||
|
||
try:
|
||
# 使用try-except处理可能的数据库约束错误
|
||
article = Article.objects.create(
|
||
website=website,
|
||
title=title,
|
||
url=url,
|
||
content=content_html,
|
||
pub_date=timezone.now(),
|
||
media_files=media_files
|
||
)
|
||
print(f"已保存文章及图片:{title}")
|
||
except Exception as e:
|
||
# 处理重复URL或其他数据库错误
|
||
if "UNIQUE constraint failed" in str(e) and "core_article.url" in str(e):
|
||
print(f"文章URL重复,跳过保存: {url}")
|
||
else:
|
||
print(f"保存文章时出错: {url},错误:{e}")
|
||
|
||
|
||
def is_valid_url(url, base_netloc):
    """Accept only http(s) URLs on the same host as the start URL."""
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return False
        if parsed.netloc != base_netloc:
            return False
        return True
    except Exception:
        return False


def full_site_crawler(start_url, website, max_pages=1000):
    """Breadth-first crawl of a site starting from start_url.

    Follows same-host links, up to max_pages pages, and classifies each fetched
    page as an article page or not using per-site rules.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    visited = set()
    queue = deque([start_url])

    base_netloc = urlparse(start_url).netloc

    pages_crawled = 0

    while queue and pages_crawled < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        print(f"正在爬取:{url}")
        visited.add(url)

        try:
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()
        except Exception as e:
            print(f"请求失败:{url},错误:{e}")
            continue

        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")

# 根据不同网站判断文章页面
|
||
is_article_page = False
|
||
if website.name == "新华网" or website.name == "www.news.cn" or "新华网" in website.name:
|
||
# 新华网的文章页面判断逻辑 - 修复不保存文章内容问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
soup.find("div", class_="article-content") is not None or
|
||
soup.find("div", class_="content") is not None or
|
||
soup.find("div", id="content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("span", id="detailContent") is not None or # 添加新华网特有内容容器判断
|
||
("/news/" in path) or
|
||
("/article/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif website.name == "东方烟草报":
|
||
# 对于东方烟草报,我们增加基于URL模式的判断
|
||
# 东方烟草报的文章URL通常包含/content/和日期格式
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
soup.find("div", class_="content") is not None or
|
||
soup.find("div", id="gallery") is not None or
|
||
soup.find("div", id="ContentText") is not None or
|
||
("/content/" in path and len(path) > 20)
|
||
)
|
||
elif website.name == "www.gov.cn":
|
||
# 中国政府网的文章页面判断逻辑 - 修复两个标题问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
soup.find("div", class_="pages_content") is not None or
|
||
soup.find("div", class_="article_con") is not None or
|
||
soup.find("div", class_="content") is not None or
|
||
soup.find("div", id="content") is not None or
|
||
soup.find("div", class_="mainBody") is not None or
|
||
("/zhengce/" in path) or
|
||
("/xinwen/" in path) or
|
||
("/huoban/" in path)
|
||
)
|
||
elif "人民日报" in website.name or "人民网" in website.name:
|
||
# 人民日报的文章页面判断逻辑 - 修复乱码和404问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
# 修改: 增加更准确的文章页面判断逻辑
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("section", class_="content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
("/article/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10) or
|
||
# 增加对peopleapp.com特定文章路径的判断
|
||
("/dynamic/" in path and "article" in path)
|
||
)
|
||
elif "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
|
||
# 央视网站的文章页面判断逻辑 - 修复不保存正文和图片问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content_area") is not None and
|
||
soup.find("h1") is not None) or # 央视网特有内容容器
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/news/" in path) or
|
||
("/article/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10) or
|
||
("ARTI" in path) # 央视网文章URL特征
|
||
)
|
||
elif "求是" in website.name:
|
||
# 求是网站的文章页面判断逻辑 - 修复两个标题问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "解放军报" in website.name or "81.cn" in website.name:
|
||
# 解放军报的文章页面判断逻辑 - 修复有视频的文章不被爬取问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="artichle-info") is not None and
|
||
soup.find("title") is not None) or # 解放军报特有内容容器
|
||
(soup.find("div", class_="article-content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/zt/" in path) or # 解放军报专题栏目
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
|
||
# 排除列表页面
|
||
if "/index.html" in path or path.endswith("/"):
|
||
is_article_page = False
|
||
elif "光明日报" in website.name or "gmw.cn" in website.name:
|
||
# 光明日报的文章页面判断逻辑 - 修复不保存文章内容问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "经济日报" in website.name or "ce.cn" in website.name:
|
||
# 经济日报的文章页面判断逻辑
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "中国日报" in website.name or "chinadaily" in website.name:
|
||
# 中国日报的文章页面判断逻辑 - 修复不保存文章内容问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", id="Content") is not None and # 中国日报特有内容容器
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "工人日报" in website.name or "workercn" in website.name:
|
||
# 工人日报的文章页面判断逻辑 - 修复不保存正文和图片问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="ccontent") is not None and
|
||
soup.find("h1") is not None) or # 工人日报特有内容容器
|
||
(soup.find("div", class_="article-content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/c/" in path) or # 工人日报文章URL特征
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "科技日报" in website.name or "stdaily" in website.name:
|
||
# 科技日报的文章页面判断逻辑 - 修复无法爬取问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "人民政协报" in website.name or "rmzxb" in website.name:
|
||
# 人民政协报的文章页面判断逻辑 - 修复爬取错误问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "中国纪检监察报" in website.name or "jjjcb" in website.name:
|
||
# 中国纪检监察报的文章页面判断逻辑 - 修复无法爬取问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "中国新闻社" in website.name or "chinanews" in website.name:
|
||
# 中国新闻社的文章页面判断逻辑 - 修复爬取非文章部分问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
soup.find("div", class_="left_zw") is not None or # 中国新闻网正文区域
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "学习时报" in website.name or "studytimes" in website.name:
|
||
# 学习时报的文章页面判断逻辑 - 修复不保存文章内容问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
soup.find("div", id="detail") is not None or # 添加学习时报特有内容容器判断
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "中国青年报" in website.name or "cyol" in website.name:
|
||
# 中国青年报的文章页面判断逻辑 - 修复无法爬取问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "中国妇女报" in website.name or "cnwomen" in website.name:
|
||
# 中国妇女报的文章页面判断逻辑 - 修复不保存文章内容问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "法治日报" in website.name or "legaldaily" in website.name:
|
||
# 法治日报的文章页面判断逻辑 - 修复不保存正文和图片问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content-two") is not None and
|
||
soup.find("h1") is not None) or # 法治日报特有内容容器
|
||
(soup.find("div", class_="article-content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/content/" in path and "content_" in path) or # 法治日报特有的文章URL模式
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
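        # Note: in the legaldaily branch above, ("/content/" in path and "content_" in path)
        # is logically subsumed by the plain ("/content/" in path) check that follows it;
        # it is redundant but documents the site's URL shape (a hypothetical example would be
        # /content/2024-06/01/content_123456.html).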
elif "农民日报" in website.name or "farmer" in website.name:
|
||
# 农民日报的文章页面判断逻辑 - 修复不保存正文和图片问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="detailCon") is not None and
|
||
soup.find("h1") is not None) or # 农民日报特有内容容器
|
||
(soup.find("div", class_="article-content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "学习强国" in website.name or "xuexi" in website.name:
|
||
# 学习强国的文章页面判断逻辑 - 修复无法爬取问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="article-content") is not None or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
elif "旗帜网" in website.name or "qizhiwang" in website.name:
|
||
# 旗帜网的文章页面判断逻辑 - 修复不保存正文和图片问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="w1200 flag-text-con clearfix") is not None and
|
||
soup.find("h1") is not None) or # 旗帜网特有内容容器
|
||
(soup.find("div", class_="article-content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/n1/" in path) or # 旗帜网文章URL特征
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
(path.startswith("/detail/") and len(path) > 10) or
|
||
# 简化判断:只要有h1标题就认为是文章页面
|
||
soup.find("h1") is not None
|
||
)
|
||
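        # Note: because the qizhiwang branch above ends with a bare soup.find("h1") check,
        # it effectively accepts any page that contains an <h1>, which makes the preceding
        # container and URL conditions in that branch redundant.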
elif "中国网" in website.name or "china.com.cn" in website.name:
|
||
# 中国网的文章页面判断逻辑 - 修复不保存正文和图片问题
|
||
parsed_url = urlparse(url)
|
||
path = parsed_url.path
|
||
is_article_page = (
|
||
(soup.find("div", class_="main") is not None and
|
||
soup.find("h1") is not None) or # 中国网特有内容容器
|
||
(soup.find("div", class_="article-content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", class_="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
(soup.find("div", id="content") is not None and
|
||
soup.find("h1") is not None) or
|
||
soup.find("div", class_="text") is not None or
|
||
soup.find("div", class_="main-content") is not None or
|
||
soup.find("div", class_="article") is not None or
|
||
soup.find("div", class_="article-body") is not None or
|
||
("/article/" in path) or
|
||
("/content/" in path) or
|
||
("/opinion/" in path) or # 中国网观点栏目
|
||
(path.startswith("/detail/") and len(path) > 10)
|
||
)
|
||
else:
|
||
# 默认判断逻辑
|
||
is_article_page = (
|
||
soup.find("div", class_="content") is not None or
|
||
soup.find("div", id="content") is not None
|
||
)
|
||
|
||
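        # Readability sketch (not wired into the branches above; the helper name, signature
        # and parameters are illustrative, not part of the original module). The per-site
        # branches could be collapsed into one data-driven check along these lines:
        def _looks_like_article(page_soup, url_path, extra_classes=(), extra_path_markers=()):
            """Hypothetical consolidation of the shared article-page heuristic."""
            generic_classes = ("content", "article-content", "text", "main-content",
                               "article", "article-body")
            has_container = (
                any(page_soup.find("div", class_=c) is not None
                    for c in tuple(extra_classes) + generic_classes) or
                page_soup.find("div", id="content") is not None
            )
            has_article_path = (
                any(m in url_path for m in ("/article/", "/content/") + tuple(extra_path_markers)) or
                (url_path.startswith("/detail/") and len(url_path) > 10)
            )
            return has_container or has_article_path
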
        # If this looks like an article page, hand it off to article processing
        if is_article_page:
            process_article(url, website)
            pages_crawled += 1

        # Expand the queue by discovering new links
        for link in soup.find_all("a", href=True):
            href = urljoin(url, link["href"])
            # For 人民日报 (People's Daily), use broader link-discovery logic
            if website.name == "人民日报":
                # Allow links that start with https://www.peopleapp.com/
                if href.startswith("https://www.peopleapp.com/") and href not in visited:
                    # Improve recognition of article links
                    parsed_href = urlparse(href)
                    href_path = parsed_href.path
                    # Cover more possible article URL patterns
                    if ("/article/" in href_path or
                            href_path.startswith("/detail/") or
                            ("/dynamic/" in href_path and "article" in href_path) or
                            href_path.count("/") > 2):  # deeper paths
                        queue.append(href)
            elif href not in visited and is_valid_url(href, base_netloc):
                queue.append(href)
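            # Illustrative note (hypothetical URLs): under the People's Daily branch above,
            # https://www.peopleapp.com/article/123456 would be enqueued via the "/article/"
            # test and https://www.peopleapp.com/a/b/c via the path-depth check; links for
            # all other sites pass through the generic is_valid_url(href, base_netloc) filter.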