"""Crawler utilities: fetch article pages (via requests or Selenium), extract each supported site's title, body and media, and save them as Article records."""
import os
import re
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from django.conf import settings
from django.utils import timezone
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

from core.models import Article


def get_selenium_driver():
    """Create a headless Chrome WebDriver instance, or return None on failure."""
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument(
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )
        service = Service(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=chrome_options)
    except Exception as e:
        print(f"创建Selenium WebDriver失败: {e}")
        return None


def get_page_with_selenium(url, website_name):
    """Fetch a dynamically rendered page with Selenium and return its HTML source."""
    driver = None
    try:
        driver = get_selenium_driver()
        if not driver:
            return None

        print(f"使用Selenium加载页面: {url}")
        driver.get(url)

        # Per-site wait budget: most dynamic sites need ~12s, Xuexi Qiangguo needs longer.
        wait_time = 10
        if "学习强国" in website_name:
            wait_time = 15
        elif any(key in website_name for key in (
            "法治日报", "中国新闻社", "chinanews", "中国政府网", "gov.cn",
            "工人日报", "workercn", "经济日报", "ce.cn", "求是", "qstheory",
            "旗帜网", "qizhiwang", "人民日报", "people", "人民政协网", "rmzxw",
            "学习时报", "studytimes", "中国妇女报", "cnwomen", "中国青年报", "cyol",
        )):
            wait_time = 12

        # Wait for the document body to appear before reading the source.
        try:
            WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        except Exception:
            print(f"等待页面加载超时: {url}")

        # Give client-side rendering a little extra time to finish.
        time.sleep(3)
        return driver.page_source
    except Exception as e:
        print(f"Selenium获取页面失败: {url}, 错误: {e}")
        return None
    finally:
        if driver:
            try:
                driver.quit()
            except Exception:
                pass


def download_media(url, save_dir):
    """Download an image or video into save_dir and return the local path, or None on failure."""
    try:
        # Send a browser-like User-Agent and a Referer to avoid 403 Forbidden responses.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": urljoin(url, "/"),
        }
        resp = requests.get(url, timeout=15, headers=headers)
        resp.raise_for_status()
    except Exception as e:
        print(f"下载失败:{url},错误:{e}")
        return None

    # Derive a safe file name: drop the query string and replace special characters.
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    if not filename or '.' not in filename:
        # No usable file name in the URL path; fall back to a default.
        filename = 'media_file'
    filename = re.sub(r'[^\w\-_\.]', '_', filename)

    # Ensure the file has an extension, inferring it from the Content-Type header if needed.
    if '.' not in filename:
        content_type = resp.headers.get('content-type', '')
        if 'image/jpeg' in content_type:
            filename += '.jpg'
        elif 'image/png' in content_type:
            filename += '.png'
        elif 'image/gif' in content_type:
            filename += '.gif'
        elif 'video/mp4' in content_type:
            filename += '.mp4'
        elif 'video/avi' in content_type:
            filename += '.avi'
        elif 'video/quicktime' in content_type:
            filename += '.mov'
        else:
            filename += '.bin'  # default binary extension

    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)

    # Avoid overwriting an existing file by appending a counter.
    base, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(filepath):
        filename = f"{base}_{counter}{ext}"
        filepath = os.path.join(save_dir, filename)
        counter += 1

    with open(filepath, "wb") as f:
        f.write(resp.content)
    return filepath


def process_article(url, website):
    """Fetch one article page, extract its title, body and media, and save it."""
    # Skip articles that have already been saved.
    if Article.objects.filter(url=url).exists():
        print(f"文章已存在,跳过: {url}")
        return

    headers = {"User-Agent": "Mozilla/5.0"}

    # Per-site request headers. Every matching site gets the same browser-like headers
    # plus a site-specific Referer (People's Daily sends no Referer). The first matching
    # entry wins, mirroring the original if/elif chain.
    site_referers = [
        (("人民网", "人民日报"), None),
        (("光明日报", "gmw.cn"), "https://www.gmw.cn/"),
        (("央视", "CCTV", "cctv"), "https://news.cctv.com/"),
        (("中国网", "china.com.cn"), "http://www.china.com.cn/"),
        (("法治日报", "legaldaily"), "http://www.legaldaily.com.cn/"),
        (("工人日报", "workercn"), "https://www.workercn.cn/"),
        (("农民日报", "farmer"), "https://www.farmer.com.cn/"),
        (("解放军报", "81.cn"), "http://www.81.cn/"),
        (("旗帜网", "qizhiwang"), "http://www.qizhiwang.org.cn/"),
        (("中国新闻社", "chinanews"), "https://www.chinanews.com.cn/"),
        (("中国政府网", "gov.cn"), "https://www.gov.cn/"),
        (("经济日报", "ce.cn"), "http://www.ce.cn/"),
        (("求是", "qstheory"), "http://www.qstheory.cn/"),
        (("人民政协网", "rmzxw"), "https://www.rmzxw.com.cn/"),
        (("学习时报", "studytimes"), "https://www.studytimes.cn/"),
        (("中国妇女报", "cnwomen"), "https://www.cnwomen.com.cn/"),
        (("中国青年报", "cyol"), "http://news.cyol.com/"),
    ]
    for keywords, referer in site_referers:
        if any(key in website.name for key in keywords):
            headers.update({
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            })
            if referer:
                headers["Referer"] = referer
            break

    # Sites whose articles are rendered client-side and must be fetched with Selenium.
    need_selenium = any(name in website.name for name in (
        "中国妇女报", "cnwomen", "中国纪检监察报", "jjjcb", "中国青年报", "cyol",
    ))

    try:
        if need_selenium:
            # Fetch the fully rendered page with Selenium.
            page_source = get_page_with_selenium(url, website.name)
            if not page_source:
                print(f"Selenium获取页面失败:{url}")
                return
            # Very short pages are usually redirect stubs.
            min_length = 200
            if len(page_source) < min_length:
                print(f"页面内容过短,可能是重定向页面:{url}")
                return
            soup = BeautifulSoup(page_source, "html.parser")
        else:
            # Fetch static pages with requests.
            resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
            resp.raise_for_status()
            # Very short pages are usually redirect stubs.
            if len(resp.text) < 300:
                print(f"页面内容过短,可能是重定向页面:{url}")
                return
            # Every supported site is served as UTF-8; forcing it avoids mojibake from
            # mis-detected encodings (the old per-site GBK/GB2312/Big5 fallbacks were dropped).
            resp.encoding = 'utf-8'
            soup = BeautifulSoup(resp.text, "html.parser")
    except Exception as e:
        print(f"请求失败:{url},错误:{e}")
        return

    # Per-site article structure handling.
    if website.name == "www.news.cn" or "新华网" in website.name:
        # Xinhua: try several title/body containers (fixes articles saved without body content).
        title_tag = (
            soup.find("h1", class_="title")
            or soup.find("h1")
            or soup.find("div", class_="title")
            or soup.find("title")
        )
        content_tag = (
            soup.find("div", class_="article-content")
            or soup.find("div", class_="content") or
soup.find("div", id="content") or soup.find("div", class_="article") or soup.find("div", class_="main-content") or soup.find("span", id="detailContent") # 添加新华网特有的内容容器 ) elif website.name == "东方烟草报": # 优化东方烟草报的标题提取逻辑,按优先级尝试多种选择器 title_tag = ( soup.find("h1", id="title") or # 特别针对带id="title"的h1标签 soup.find("h1") or # 主要标题标签 soup.find("title") or # 页面title标签 soup.find("div", class_="title") or # 某些页面可能使用div.title soup.find("h2") # 备选标题标签 ) content_tag = soup.find("div", class_="content") # 东方烟草报的内容通常在div.content中 # 增加对另一种内容结构的支持 if not content_tag: content_tag = soup.find("div", id="gallery") # 再增加对新内容结构的支持 if not content_tag: content_tag = soup.find("div", id="ContentText") elif website.name == "www.gov.cn": # 中国政府网的文章结构处理 - 修复标题重复问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) # 查找主要内容区域,通常在.mainBody或content中 content_tag = ( soup.find("div", class_="pages_content") or soup.find("div", class_="article_con") or soup.find("div", class_="content") or soup.find("div", id="content") or soup.find("div", class_="mainBody") ) # 针对中国政府网的特殊处理,清理内容中的重复标题 if content_tag and title_tag: title_text = title_tag.get_text(strip=True) if title_text: # 移除内容中的重复标题元素 for heading in content_tag.find_all(["h1", "h2", "h3"]): heading_text = heading.get_text(strip=True) if title_text in heading_text or heading_text in title_text: heading.decompose() # 移除class包含title的元素 for title_element in content_tag.find_all(class_=lambda x: x and "title" in x): title_element_text = title_element.get_text(strip=True) if title_text in title_element_text or title_element_text in title_text: title_element.decompose() # 移除id为ti的元素(中国政府网特有的标题元素) for ti_element in content_tag.find_all(id="ti"): ti_element.decompose() # 移除包含"简历"等关键词的重复标题 for element in content_tag.find_all(["h1", "h2", "h3", "strong", "b"]): element_text = element.get_text(strip=True) if "简历" in element_text and len(element_text) < 20: element.decompose() # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="editor"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share"): share_element.decompose() # 移除script标签 for script_element in content_tag.find_all("script"): script_element.decompose() # 移除样式标签 for style_element in content_tag.find_all("style"): style_element.decompose() elif "人民日报" in website.name or "人民网" in website.name: # 人民日报网站的文章结构处理 - 修复乱码和404问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) # 查找主要内容区域 content_tag = ( soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("section", class_="content") or soup.find("div", class_="article") or soup.find("div", class_="rm_txt_con") or # 添加人民网特有的内容容器 soup.find("div", class_="text_c") or # 添加新的内容容器 soup.find("div", class_="article-detail") or # 人民日报文章详情容器 soup.find("div", class_="detail-content") or # 人民日报详情内容容器 soup.find("div", class_="article-text") or # 人民日报文章文本容器 soup.find("div", class_="content-text") or # 人民日报内容文本容器 soup.find("div", class_="news-content") or # 人民日报新闻内容容器 soup.find("div", class_="news-text") or # 人民日报新闻文本容器 soup.find("div", class_="news-detail") or # 人民日报新闻详情容器 soup.find("div", class_="article-main") or # 人民日报文章主体容器 soup.find("div", class_="article-container") or # 人民日报文章容器 soup.find("div", class_="content-container") or # 人民日报内容容器 soup.find("div", class_="text-container") or # 人民日报文本容器 soup.find("div", class_="main-container") # 人民日报主体容器 ) # 
针对人民网的特殊处理,清理内容中的无关元素 if content_tag: # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="edit"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("p", class_="paper_num"): share_element.decompose() # 移除无关的box_pic元素 for pic_element in content_tag.find_all("div", class_="box_pic"): pic_element.decompose() # 移除无关的zdfy元素 for zdfy_element in content_tag.find_all("div", class_="zdfy"): zdfy_element.decompose() # 移除无关的center元素 for center_element in content_tag.find_all("center"): center_element.decompose() # 移除无关的bza元素 for bza_element in content_tag.find_all("div", class_="bza"): bza_element.decompose() # 移除隐藏的无关元素 for hidden_element in content_tag.find_all(attrs={"style": "display: none;"}): hidden_element.decompose() # 移除相关专题 for related_element in content_tag.find_all("div", id="rwb_tjyd"): related_element.decompose() # 移除推荐阅读 for recommend_element in content_tag.find_all("div", class_="clearfix box_cai"): recommend_element.decompose() # 移除相关专题列表 for topic_element in content_tag.find_all("div", class_="clearfix text_like"): topic_element.decompose() elif "央视" in website.name or "CCTV" in website.name or "cctv" in website.name: # 央视网站的文章结构处理 - 修复不保存正文和图片问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1", class_="title_text") or # 央视网特有标题类 soup.find("h1") or soup.find("title") ) content_tag = ( soup.find("div", class_="content_area") or # 央视网特有内容容器 soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") or soup.find("div", class_="article-detail") or # 央视网文章详情容器 soup.find("div", class_="detail-content") or # 央视网详情内容容器 soup.find("div", class_="article-text") or # 央视网文章文本容器 soup.find("div", class_="content-text") or # 央视网内容文本容器 soup.find("div", class_="news-content") or # 央视网新闻内容容器 soup.find("div", class_="news-text") or # 央视网新闻文本容器 soup.find("div", class_="news-detail") or # 央视网新闻详情容器 soup.find("div", class_="article-main") or # 央视网文章主体容器 soup.find("div", class_="article-container") or # 央视网文章容器 soup.find("div", class_="content-container") or # 央视网内容容器 soup.find("div", class_="text-container") or # 央视网文本容器 soup.find("div", class_="main-container") # 央视网主体容器 ) # 针对央视网的特殊处理,清理内容中的无关元素 if content_tag: # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="editor"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share"): share_element.decompose() # 移除相关推荐 for recommend_element in content_tag.find_all("div", class_="related"): recommend_element.decompose() # 移除评论区域 for comment_element in content_tag.find_all("div", class_="comment"): comment_element.decompose() # 移除script标签 for script_element in content_tag.find_all("script"): script_element.decompose() # 移除样式标签 for style_element in content_tag.find_all("style"): style_element.decompose() elif "求是" in website.name: # 求是网站的文章结构处理 - 修复标题和正文清理问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h2", class_="title") or soup.find("h1") or soup.find("h2") or soup.find("p", class_="title") or soup.find("title") ) # 针对求是的特殊处理,如果标题为空或太短,尝试从title标签提取 if title_tag: title_text = title_tag.get_text(strip=True) if not title_text or len(title_text) < 5: title_tag = soup.find("title") # 针对求是的特殊处理,确保标题被正确提取 if not title_tag or not title_tag.get_text(strip=True): title_tag = soup.find("title") # 
针对求是的特殊处理,如果标题包含"海报"等关键词,尝试从内容中提取更好的标题 if title_tag: title_text = title_tag.get_text(strip=True) if "海报" in title_text or "图" in title_text: # 尝试从内容中查找更好的标题 content_h1 = soup.find("h1") if content_h1 and content_h1 != title_tag: content_title = content_h1.get_text(strip=True) if len(content_title) > len(title_text) and "海报" not in content_title: title_tag = content_h1 content_tag = ( soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") ) if content_tag: # 移除重复标题(放宽匹配条件,允许部分匹配) if title_tag: title_text = title_tag.get_text(strip=True) if title_text: # 移除所有可能的重复标题元素 for strong_tag in content_tag.find_all("strong"): strong_text = strong_tag.get_text(strip=True) if strong_text and (title_text in strong_text or strong_text in title_text): parent_p = strong_tag.find_parent("p") # 如果 strong 在正文前两段内,就删除 if parent_p in content_tag.find_all("p")[:2]: strong_tag.decompose() # 移除h1、h2、h3标题元素中的重复标题 for heading in content_tag.find_all(["h1", "h2", "h3"]): heading_text = heading.get_text(strip=True) if heading_text and (title_text in heading_text or heading_text in title_text): # 确保不删除title_tag本身 if heading != title_tag: heading.decompose() # 移除class包含title的元素 for title_element in content_tag.find_all(class_=lambda x: x and "title" in x): title_element_text = title_element.get_text(strip=True) if title_element_text and ( title_text in title_element_text or title_element_text in title_text): # 确保不删除title_tag本身 if title_element != title_tag: title_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="sharebox"): share_element.decompose() # 移除二维码相关元素 for qr_element in content_tag.find_all("div", class_="xl_ewm"): qr_element.decompose() # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="fs-text"): text = editor_element.get_text(strip=True) if text and ("网站编辑" in text or "审核" in text): editor_element.decompose() # 移除声明链接 for declare_element in content_tag.find_all("a", href=lambda x: x and "qssyggw" in x): declare_element.decompose() # 移除clearfix等无关div for clear_element in content_tag.find_all("div", class_="clear"): clear_element.decompose() # 移除分隔线 for line_element in content_tag.find_all("div", class_="fs-line"): line_element.decompose() for line_element in content_tag.find_all("div", class_="fs-line_b"): line_element.decompose() # unwrap 剪贴板相关元素(保留文字,去掉外层标签) for clipboard_element in content_tag.find_all("div", class_="clipboard_text"): clipboard_element.unwrap() # unwrap highlight 包装层(保留文字) for highlight_element in content_tag.find_all("div", class_="highlight"): highlight_element.unwrap() elif "解放军报" in website.name or "81.cn" in website.name: # 解放军报的文章结构处理 - 修复有视频的文章不被爬取问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1", class_="article-title") or # 解放军报特有标题类 soup.find("h1") or soup.find("h2") or # 解放军报使用h2标签作为标题 soup.find("title") ) # 针对解放军报的特殊处理,如果标题为空或太短,尝试从title标签提取 if title_tag: title_text = title_tag.get_text(strip=True) if not title_text or len(title_text) < 5: title_tag = soup.find("title") content_tag = ( soup.find("div", id="article-content") or # 解放军报实际文章内容容器 soup.find("div", class_="text") or soup.find("div", class_="article-content") or soup.find("div", class_="content") or soup.find("div", id="content") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") or soup.find("div", 
class_="artichle-info") # 作为备选 ) # 针对解放军报的特殊处理,清理内容中的无关元素 if content_tag: # 移除面包屑导航 for breadcrumb in content_tag.find_all("ol", class_="breadcrumb"): breadcrumb.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share-custom"): share_element.decompose() # 移除作者信息段落 for author_p in content_tag.find_all("p"): text = author_p.get_text(strip=True) if "来源:" in text or "作者:" in text or "责任编辑:" in text or "发布:" in text: author_p.decompose() # 移除进度条 for progress in content_tag.find_all("div", class_="progress-bar"): progress.decompose() # 移除播放器 for player in content_tag.find_all("div", class_="player"): player.decompose() # 移除媒体URL容器 for media in content_tag.find_all("div", id="mediaurl"): media.decompose() # 移除新闻列表(但保留其中的内容) for news_list in content_tag.find_all("ul", id="main-news-list"): # 不删除整个ul,而是unwrap它,保留其中的内容 news_list.unwrap() # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="editor"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share"): share_element.decompose() # 移除相关推荐 for recommend_element in content_tag.find_all("div", class_="related"): recommend_element.decompose() # 移除script标签 for script_element in content_tag.find_all("script"): script_element.decompose() # 移除样式标签 for style_element in content_tag.find_all("style"): style_element.decompose() elif "光明日报" in website.name or "gmw.cn" in website.name: # 光明日报的文章结构处理 - 修复不保存文章内容问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) content_tag = ( soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") ) elif "经济日报" in website.name or "ce.cn" in website.name: # 经济日报的文章结构处理 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) content_tag = ( soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") ) elif "中国日报" in website.name or "chinadaily" in website.name: # 中国日报的文章结构处理 - 修复不保存文章内容问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) content_tag = ( soup.find("div", id="Content") or # 中国日报特有内容容器 soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") ) # 处理中国日报的分页内容 if content_tag and ("中国日报" in website.name or "chinadaily" in website.name): # 查找分页链接 page_links = [] current_page_elem = soup.find("div", id="div_currpage") if current_page_elem: # 查找所有分页链接 page_links = [a for a in current_page_elem.find_all("a", href=True) if not a.find("img")] # 如果有分页,收集所有页面内容 if page_links: print(f"发现分页内容,共 {len(page_links)} 页需要处理") # 收集所有页面的内容 all_content_html = str(content_tag) # 处理每个分页链接 for page_link in page_links: page_url = urljoin(url, page_link['href']) if page_url != url: # 避免重复处理第一页 try: page_resp = requests.get(page_url, headers=headers, timeout=15) page_resp.encoding = 'utf-8' page_soup = BeautifulSoup(page_resp.text, "html.parser") # 提取分页内容 page_content = page_soup.find("div", id="Content") if page_content: all_content_html += 
str(page_content) print(f"已处理分页: {page_url}") except Exception as e: print(f"处理分页失败 {page_url}: {e}") # 更新content_tag为包含所有分页内容 content_tag = BeautifulSoup(all_content_html, "html.parser") elif "工人日报" in website.name or "workercn" in website.name: # 工人日报的文章结构处理 - 修复不保存正文和图片问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1", class_="article-title") or # 工人日报特有标题类 soup.find("h1") or soup.find("title") ) # 针对工人日报的特殊处理,如果标题为空或太短,尝试从title标签提取 if title_tag: title_text = title_tag.get_text(strip=True) if not title_text or len(title_text) < 5: title_tag = soup.find("title") # 进一步处理:如果h1标题包含太多无关信息,尝试从title标签提取更简洁的标题 if title_tag and title_tag.name == 'h1': title_text = title_tag.get_text(strip=True) if title_text and len(title_text) > 50: # 如果h1标题太长 title_tag = soup.find("title") content_tag = ( soup.find("div", class_="ccontent") or # 工人日报特有内容容器 soup.find("div", class_="article-content") or soup.find("div", class_="content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") ) # 针对工人日报的特殊处理,清理内容中的无关元素 if content_tag: # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="editor"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share"): share_element.decompose() # 移除相关推荐 for recommend_element in content_tag.find_all("div", class_="related"): recommend_element.decompose() # 移除script标签 for script_element in content_tag.find_all("script"): script_element.decompose() # 移除样式标签 for style_element in content_tag.find_all("style"): style_element.decompose() elif "科技日报" in website.name or "stdaily" in website.name: # 科技日报的文章结构处理 - 修复无法爬取问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) content_tag = ( soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") ) elif "人民政协报" in website.name or "rmzxb" in website.name: # 人民政协报的文章结构处理 - 修复爬取错误问题 title_tag = ( soup.find("h1", class_="Content_title") or # 添加人民政协网特有标题类 soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) # 特殊处理人民政协网的标题结构 if title_tag and title_tag.find("span", id="a"): title_tag = title_tag.find("span", id="a") elif title_tag and (title_tag.get_text(strip=True) == "首页>聚焦" or title_tag.get_text(strip=True) == "首页 > 聚焦"): # 如果标题还是"首页>聚焦",尝试从内容中提取标题 # 查找文章正文中的第一个strong标签作为标题 content_div = soup.find("div", class_="text_box") if content_div: first_p = content_div.find("p") if first_p and first_p.find("strong"): title_text = first_p.find("strong").get_text().strip() # 创建一个虚拟的title_tag对象 title_tag = first_p.find("strong") else: # 如果没有找到strong标签,尝试查找内容中的第一个h2标签 first_h2 = content_div.find("h2") if first_h2: title_tag = first_h2 # 针对人民政协网的特殊处理,如果标题包含"首页>聚焦",尝试从页面中查找更好的标题 if title_tag and ("首页>聚焦" in title_tag.get_text(strip=True) or "首页 > 聚焦" in title_tag.get_text(strip=True)): # 尝试从页面中查找其他可能的标题 for h in soup.find_all(["h1", "h2", "h3"]): h_text = h.get_text(strip=True) if h_text and "首页>聚焦" not in h_text and "首页 > 聚焦" not in h_text and len(h_text) > 5: title_tag = h break # 如果还是没找到,尝试从title标签提取 if "首页>聚焦" in title_tag.get_text(strip=True) or "首页 > 聚焦" in title_tag.get_text(strip=True): page_title = soup.find("title") if page_title: title_text = 
page_title.get_text(strip=True) # 移除网站名称等后缀信息 if " - 人民政协网" in title_text: title_text = title_text.split(" - 人民政协网")[0] if "首页>聚焦" not in title_text and "首页 > 聚焦" not in title_text and len(title_text) > 5: title_tag = page_title content_tag = ( soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") or soup.find("div", class_="text_box") # 添加人民政协网特有内容容器 ) # 针对人民政协网的特殊处理,清理内容中的无关元素 if content_tag: # 移除编辑信息 for editor_element in content_tag.find_all("p", class_="Editor"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share"): share_element.decompose() # 移除Remark元素 for remark_element in content_tag.find_all("div", class_="Remark"): remark_element.decompose() # 移除Paging元素 for paging_element in content_tag.find_all("div", class_="Paging"): paging_element.decompose() # 移除政协号客户端下载提示 for zxh_element in content_tag.find_all("div", style=lambda x: x and "background:#F9F9F9;padding:50px" in x): zxh_element.decompose() # 移除版权信息 for copyright_element in content_tag.find_all("div", class_="copyright"): copyright_element.decompose() # 移除script标签 for script_element in content_tag.find_all("script"): script_element.decompose() # 移除样式标签 for style_element in content_tag.find_all("style"): style_element.decompose() elif "中国纪检监察报" in website.name or "jjjcb" in website.name: # 中国纪检监察报的文章结构处理 - 修复无法爬取问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) # 针对中国纪检监察报的特殊处理,确保标题被正确提取 if not title_tag or not title_tag.get_text(strip=True): title_tag = soup.find("title") content_tag = ( soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") or soup.find("div", class_="main") or soup.find("div", class_="detail") or soup.find("div", class_="article_yt") # 中国纪检监察报特有内容容器 ) elif "中国新闻社" in website.name or "chinanews" in website.name: # 中国新闻社的文章结构处理 - 修复爬取非文章部分问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) # 修改内容选择器,更精确地定位文章正文区域 content_tag = ( soup.find("div", class_="left_zw") or # 中国新闻网文章正文区域 soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") ) elif "学习时报" in website.name or "studytimes" in website.name: # 学习时报的文章结构处理 - 修复不保存文章内容问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) content_tag = ( soup.find("div", id="detail") or # 添加学习时报特有内容容器 soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") ) # 针对学习时报的特殊处理,清理内容中的无关元素 if content_tag: # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="editor"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share"): share_element.decompose() # 移除无关的TRS_Editor包装层 for trs_editor in 
content_tag.find_all("div", class_="TRS_Editor"): trs_editor.unwrap() # unwrap只移除标签,保留内容 # 移除Custom_UnionStyle包装层 for custom_style in content_tag.find_all("div", class_="Custom_UnionStyle"): custom_style.unwrap() # unwrap只移除标签,保留内容 elif "中国青年报" in website.name or "cyol" in website.name: # 中国青年报的文章结构处理 - 修复无法爬取问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) # 针对中国青年报的特殊处理,确保标题被正确提取 if not title_tag or not title_tag.get_text(strip=True): title_tag = soup.find("title") content_tag = ( soup.find("div", class_="main") or # 中国青年报特有内容容器 soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") or soup.find("div", class_="article-detail") or # 中国青年报文章详情容器 soup.find("div", class_="detail-content") or # 中国青年报详情内容容器 soup.find("div", class_="article-text") or # 中国青年报文章文本容器 soup.find("div", class_="content-text") or # 中国青年报内容文本容器 soup.find("div", class_="news-content") or # 中国青年报新闻内容容器 soup.find("div", class_="news-text") or # 中国青年报新闻文本容器 soup.find("div", class_="news-detail") # 中国青年报新闻详情容器 ) elif "中国妇女报" in website.name or "cnwomen" in website.name: # 中国妇女报的文章结构处理 - 修复不保存文章内容问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) # 针对中国妇女报的特殊处理,确保标题被正确提取 if not title_tag or not title_tag.get_text(strip=True): title_tag = soup.find("title") content_tag = ( soup.find("div", class_="f_container") or # 中国妇女报特有内容容器 soup.find("div", class_="f_container_left") or # 中国妇女报特有内容容器 soup.find("div", class_="f_navigation_bars") or # 中国妇女报特有内容容器 soup.find("div", class_="main") or # 中国妇女报特有内容容器 soup.find("div", class_="news") or # 中国妇女报特有内容容器 soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") or soup.find("div", class_="article-detail") or # 中国妇女报文章详情容器 soup.find("div", class_="detail-content") or # 中国妇女报详情内容容器 soup.find("div", class_="article-text") or # 中国妇女报文章文本容器 soup.find("div", class_="content-text") or # 中国妇女报内容文本容器 soup.find("div", class_="news-content") or # 中国妇女报新闻内容容器 soup.find("div", class_="news-text") or # 中国妇女报新闻文本容器 soup.find("div", class_="news-detail") or # 中国妇女报新闻详情容器 soup.find("div", class_="article-main") or # 中国妇女报文章主体容器 soup.find("div", class_="article-container") or # 中国妇女报文章容器 soup.find("div", class_="content-container") or # 中国妇女报内容容器 soup.find("div", class_="text-container") or # 中国妇女报文本容器 soup.find("div", class_="main-container") # 中国妇女报主体容器 ) elif "法治日报" in website.name or "legaldaily" in website.name: # 法治日报的文章结构处理 - 修复不保存正文和图片问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1", class_="article-title") or # 法治日报特有标题类 soup.find("h1") or soup.find("title") ) # 针对法治日报的特殊处理,确保标题被正确提取 if not title_tag or not title_tag.get_text(strip=True): title_tag = soup.find("title") content_tag = ( soup.find("div", class_="content-two") or # 优先查找content-two类 soup.find("div", class_="article-content") or # 法治日报特有内容容器 soup.find("div", class_="content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") or soup.find("div", 
class_="article-detail") or # 法治日报特有内容容器 soup.find("div", class_="detail-content") or # 法治日报特有内容容器 soup.find("div", class_="article-text") # 法治日报特有内容容器 ) # 针对法治日报的特殊处理,清理内容中的无关元素 if content_tag: # 如果找到content-two,需要进一步处理去除内部的标题元素(避免重复) if content_tag.get('class') and 'content-two' in content_tag.get('class', []): # 查找并移除内容中的标题元素(避免重复) inner_titles = content_tag.find_all(['h1', 'h2']) title_text = title_tag.get_text(strip=True) if title_tag else "无标题" for inner_title in inner_titles: if inner_title.get_text().strip() == title_text: inner_title.decompose() # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="editor"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share"): share_element.decompose() # 移除相关推荐 for recommend_element in content_tag.find_all("div", class_="related"): recommend_element.decompose() # 移除script标签 for script_element in content_tag.find_all("script"): script_element.decompose() # 移除样式标签 for style_element in content_tag.find_all("style"): style_element.decompose() elif "农民日报" in website.name or "farmer" in website.name: # 农民日报的文章结构处理 - 修复不保存正文和图片问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1", class_="article-title") or # 农民日报特有标题类 soup.find("h1") or soup.find("title") ) # 针对农民日报的特殊处理,如果标题出现乱码,尝试从title标签提取 if title_tag and title_tag.name == 'h1': title_text = title_tag.get_text(strip=True) if title_text and any(char in title_text for char in ['', '', '']): title_tag = soup.find("title") # 针对农民日报的特殊处理,确保标题被正确提取 if not title_tag or not title_tag.get_text(strip=True): title_tag = soup.find("title") content_tag = ( soup.find("div", class_="detailCon") or # 农民日报特有内容容器 soup.find("div", class_="article-content") or soup.find("div", class_="content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") or soup.find("div", class_="article-detail") or # 农民日报文章详情容器 soup.find("div", class_="detail-content") or # 农民日报详情内容容器 soup.find("div", class_="article-text") or # 农民日报文章文本容器 soup.find("div", class_="content-text") or # 农民日报内容文本容器 soup.find("div", class_="news-content") or # 农民日报新闻内容容器 soup.find("div", class_="news-text") or # 农民日报新闻文本容器 soup.find("div", class_="news-detail") or # 农民日报新闻详情容器 soup.find("div", class_="article-main") or # 农民日报文章主体容器 soup.find("div", class_="article-container") or # 农民日报文章容器 soup.find("div", class_="content-container") or # 农民日报内容容器 soup.find("div", class_="text-container") or # 农民日报文本容器 soup.find("div", class_="main-container") # 农民日报主体容器 ) # 针对农民日报的特殊处理,如果找到多个detailCon,选择内容最长的那个 if content_tag and content_tag.get('class') and 'detailCon' in content_tag.get('class', []): detail_cons = soup.find_all("div", class_="detailCon") if len(detail_cons) > 1: # 选择内容最长的detailCon longest_content = max(detail_cons, key=lambda x: len(x.get_text(strip=True))) if len(longest_content.get_text(strip=True)) > len(content_tag.get_text(strip=True)): content_tag = longest_content # 针对农民日报的特殊处理,清理内容中的无关元素 if content_tag: # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="editor"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share"): share_element.decompose() # 移除相关推荐 for recommend_element in content_tag.find_all("div", class_="related"): recommend_element.decompose() # 移除script标签 for script_element in content_tag.find_all("script"): script_element.decompose() # 移除样式标签 for style_element in 
content_tag.find_all("style"): style_element.decompose() elif "学习强国" in website.name or "xuexi" in website.name: # 学习强国的文章结构处理 - 修复无法爬取问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1") or soup.find("title") ) # 针对学习强国的特殊处理,确保标题被正确提取 if not title_tag or not title_tag.get_text(strip=True): title_tag = soup.find("title") # 针对学习强国的特殊处理,如果标题太短,尝试从title标签提取 if title_tag: title_text = title_tag.get_text(strip=True) if title_text and len(title_text) < 10: title_tag = soup.find("title") content_tag = ( soup.find("div", class_="content") or soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") or soup.find("div", class_="main") or soup.find("div", class_="detail") or soup.find("div", class_="lgpage-detail") or # 学习强国特有内容容器 soup.find("div", class_="detail-content") or # 学习强国特有内容容器 soup.find("div", class_="article-detail") or # 学习强国特有内容容器 soup.find("div", class_="xuexi") or # 学习强国特有内容容器 soup.find("div", class_="kNews") # 学习强国特有内容容器 ) elif "旗帜网" in website.name or "qizhiwang" in website.name: # 旗帜网的文章结构处理 - 修复不保存正文和图片问题 title_tag = ( soup.find("div", class_="w1200 flag-text-tit clearfix") and soup.find("div", class_="w1200 flag-text-tit clearfix").find("h1") or soup.find("h1", class_="title") or soup.find("h1", class_="article-title") or # 旗帜网特有标题类 soup.find("h1") or soup.find("title") ) # 针对旗帜网的特殊处理,如果标题为空或太短,尝试从title标签提取 if title_tag: title_text = title_tag.get_text(strip=True) if not title_text or len(title_text) < 5: title_tag = soup.find("title") # 针对旗帜网的特殊处理,确保标题被正确提取 if not title_tag or not title_tag.get_text(strip=True): title_tag = soup.find("title") # 针对旗帜网的特殊处理,如果标题太短,尝试从title标签提取 if title_tag: title_text = title_tag.get_text(strip=True) if title_text and len(title_text) < 10: title_tag = soup.find("title") content_tag = ( soup.find("div", class_="w1200 flag-text-con clearfix") or # 旗帜网特有内容容器 soup.find("div", class_="article-content") or # 旗帜网特有内容容器 soup.find("div", class_="content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") ) # 针对旗帜网的特殊处理,清理内容中的无关元素 if content_tag: # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="editor"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share"): share_element.decompose() # 移除相关推荐 for recommend_element in content_tag.find_all("div", class_="related"): recommend_element.decompose() # 移除script标签 for script_element in content_tag.find_all("script"): script_element.decompose() # 移除样式标签 for style_element in content_tag.find_all("style"): style_element.decompose() # 针对旗帜网的特殊处理,清理内容中的无关元素 if content_tag: # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="editor"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share"): share_element.decompose() # 移除相关推荐 for recommend_element in content_tag.find_all("div", class_="related"): recommend_element.decompose() # 移除script标签 for script_element in content_tag.find_all("script"): script_element.decompose() # 移除样式标签 for style_element in content_tag.find_all("style"): style_element.decompose() elif "中国网" in website.name or "china.com.cn" in website.name: # 中国网的文章结构处理 - 修复不保存正文和图片问题 title_tag = ( soup.find("h1", class_="title") or soup.find("h1", 
class_="article-title") or # 中国网特有标题类 soup.find("h1") or soup.find("title") ) # 针对中国网的特殊处理,如果标题为空或太短,尝试从title标签提取 if title_tag: title_text = title_tag.get_text(strip=True) if not title_text or len(title_text) < 5: title_tag = soup.find("title") content_tag = ( soup.find("div", class_="article") or # 中国网特有内容容器 soup.find("div", class_="main") or soup.find("div", class_="textBox") or # 中国网直播特有内容容器 soup.find("div", class_="artInfo") or # 中国网直播特有内容容器 soup.find("div", class_="article-content") or soup.find("div", class_="content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article-body") or soup.find("div", class_="news-content") or # 中国网新闻内容容器 soup.find("div", class_="news-text") or # 中国网新闻文本容器 soup.find("div", class_="news-detail") or # 中国网新闻详情容器 soup.find("div", class_="detail-content") or # 中国网详情内容容器 soup.find("div", class_="article-text") or # 中国网文章文本容器 soup.find("div", class_="content-text") # 中国网内容文本容器 ) # 针对中国网的特殊处理,清理内容中的无关元素 if content_tag: # 检查内容质量,过滤掉纯文本内容 content_text = content_tag.get_text(strip=True) if len(content_text) < 100: # 如果内容太短,可能是纯文本 print(f"中国网内容过短,可能是纯文本,跳过: {url}") return # 检查是否包含足够的HTML标签(图片、链接等) html_elements = content_tag.find_all(["img", "a", "p", "div", "span"]) if len(html_elements) < 3: # 如果HTML元素太少,可能是纯文本 print(f"中国网内容HTML元素过少,可能是纯文本,跳过: {url}") return # 移除编辑信息 for editor_element in content_tag.find_all("div", class_="editor"): editor_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="share"): share_element.decompose() # 移除相关推荐 for recommend_element in content_tag.find_all("div", class_="related"): recommend_element.decompose() # 移除script标签 for script_element in content_tag.find_all("script"): script_element.decompose() # 移除样式标签 for style_element in content_tag.find_all("style"): style_element.decompose() else: # 默认处理方式 title_tag = soup.find("h1") or soup.find("title") content_tag = soup.find("div", class_="content") or soup.find("div", id="content") # 最终标题处理 - 只有在没有网站特定处理时才使用默认处理 if not title_tag: title_tag = soup.find("h1") or soup.find("title") title = title_tag.get_text(strip=True) if title_tag else "无标题" # 对标题进行额外处理,去除可能的多余空白字符 title = title.strip() if title else "无标题" if not content_tag: print("没有找到正文,跳过:", url) return imgs = content_tag.find_all("img") # 查找视频元素 videos = content_tag.find_all("video") media_files = [] safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50] save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title) os.makedirs(save_dir, exist_ok=True) for img in imgs: src = img.get("src") if not src: continue if not src.startswith("http"): src = urljoin(url, src) local_path = download_media(src, save_dir) if local_path: rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT) img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/") media_files.append(rel_path.replace("\\", "/")) # 处理视频文件 for video in videos: src = video.get("src") if not src: # 检查标签 source = video.find("source") if source: src = source.get("src") # 检查data-src属性(央视网等网站常用) if not src: src = video.get("data-src") # 检查其他可能的视频源属性 if not src: src = video.get("data-url") or video.get("data-video") # 新增:检查新华网特有的视频源属性 if not src: src = video.get("data-video-src") # 新增:针对新华网的特殊处理,从复杂播放器结构中提取视频源 if not src and "新华网" in website.name: # 尝试从video标签的属性中直接获取src for attr in video.attrs: if 'src' in attr.lower(): src = video.attrs.get(attr) break # 如果还是没有找到,尝试查找父容器中的视频源信息 if not src: parent = video.parent if parent and 
                        parent.name == 'div' and 'player-container' in parent.get('class', []):
                        # Some players keep the video URL in a data-* attribute on the container.
                        for attr, value in parent.attrs.items():
                            if 'data' in attr and isinstance(value, str) and ('.mp4' in value or 'video' in value):
                                src = value
                                break

        # Xinhua-style players: look for a video URL embedded in inline scripts.
        if not src:
            for script in soup.find_all("script"):
                if script.string and "video" in script.string.lower():
                    video_patterns = [
                        r'https?://[^\s"\']+\.(?:mp4|flv|avi|mov|wmv)',
                        r'https?://[^\s"\']+video[^\s"\']*',
                        r'https?://[^\s"\']+media[^\s"\']*',
                    ]
                    for pattern in video_patterns:
                        matches = re.findall(pattern, script.string)
                        if matches:
                            src = matches[0]
                            break
                    if src:
                        break

        # Or an iframe-based video player.
        if not src:
            iframe = soup.find("iframe", src=lambda x: x and ("video" in x or "player" in x))
            if iframe:
                src = iframe.get("src")

        # Or a dedicated player container div.
        if not src:
            video_container = soup.find("div", class_="video-container") or soup.find("div", class_="player-container")
            if video_container:
                video_elem = video_container.find("video")
                if video_elem:
                    src = video_elem.get("src") or video_elem.get("data-src")
                # Fall back to a <source> tag inside the container.
                if not src:
                    source_elem = video_container.find("source")
                    if source_elem:
                        src = source_elem.get("src") or source_elem.get("data-src")

        # Or a plain link pointing at a video/media URL.
        if not src:
            video_links = soup.find_all("a", href=lambda x: x and ("video" in x or "media" in x))
            for link in video_links:
                href = link.get("href")
                if href and (".mp4" in href or ".flv" in href or "video" in href):
                    src = href
                    break

        # Last resorts: read the src attribute directly off the <video> tag.
        if not src and video.get("src"):
            src = video.get("src")
        if not src and "新华网" in website.name:
            if video.has_attr('src'):
                src = video.get('src')
            # Scan every attribute for something that looks like a video URL.
            for attr in video.attrs:
                if isinstance(video.attrs[attr], str) and ('.mp4' in video.attrs[attr] or 'vodpub' in video.attrs[attr]):
                    src = video.attrs[attr]
                    break

        if not src:
            continue
        if not src.startswith("http"):
            src = urljoin(url, src)

        # Site-specific logging for CCTV, Xinhua and PLA Daily video URLs.
        if "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
            if "cctv.com" in src or "cntv.cn" in src:
                print(f"发现央视视频: {src}")
        elif "新华网" in website.name:
            print(f"发现新华网视频: {src}")
        elif "解放军报" in website.name or "81.cn" in website.name:
            print(f"发现解放军报视频: {src}")
            if "81.cn" in src:
                print(f"处理解放军报视频: {src}")

        local_path = download_media(src, save_dir)
        if local_path:
            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
            # Point the video (or its <source> child) at the downloaded copy.
            if video.get("src"):
                video["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
            else:
                source = video.find("source")
                if source:
                    source["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
            media_files.append(rel_path.replace("\\", "/"))
            print(f"视频下载成功: {src}")
        else:
            print(f"视频下载失败: {src}")

    content_html = str(content_tag)
    try:
        # The url column is unique, so duplicates raise a constraint error handled below.
        article = Article.objects.create(
            website=website,
            title=title,
            url=url,
            content=content_html,
            pub_date=timezone.now(),
            media_files=media_files,
        )
        print(f"已保存文章及图片:{title}")
    except Exception as e:
        if "UNIQUE constraint failed" in str(e) and "core_article.url" in str(e):
            print(f"文章URL重复,跳过保存: {url}")
        else:
            print(f"保存文章时出错: {url},错误:{e}")


def is_valid_url(url, base_netloc):
    """Return True for http(s) URLs on the same host as the start URL."""
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return False
        if parsed.netloc != base_netloc:
            return False
        return True
    except Exception:
        return False


def full_site_crawler(start_url, website, max_pages=1000):
    """Breadth-first crawl of a site, processing every page that looks like an article."""
    headers = {"User-Agent": "Mozilla/5.0"}
    visited = set()
    queue = deque([start_url])
    base_netloc = urlparse(start_url).netloc
    pages_crawled = 0

    while queue and pages_crawled < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        print(f"正在爬取:{url}")
        visited.add(url)

        try:
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()
        except Exception as e:
            print(f"请求失败:{url},错误:{e}")
            continue

        # Every supported site is served as UTF-8; force it to avoid mojibake.
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")

        # Decide per site whether this page is an article page.
        is_article_page = False
        if website.name == "www.news.cn" or "新华网" in website.name:
            # Xinhua: article containers, or /news/, /article/, /detail/ URL patterns.
            parsed_url = urlparse(url)
            path = parsed_url.path
            is_article_page = (
                soup.find("div", class_="article-content") is not None
                or soup.find("div", class_="content") is not None
                or soup.find("div", id="content") is not None
                or soup.find("div", class_="article") is not None
                or soup.find("div", class_="main-content") is not None
                or soup.find("span", id="detailContent") is not None
                or ("/news/" in path)
                or ("/article/" in path)
                or (path.startswith("/detail/") and len(path) > 10)
            )
        elif website.name == "东方烟草报":
            # Dongfang Tobacco News: article URLs contain /content/ plus a dated path.
            parsed_url = urlparse(url)
            path = parsed_url.path
            is_article_page = (
                soup.find("div", class_="content") is not None
                or soup.find("div", id="gallery") is not None
                or soup.find("div", id="ContentText") is not None
                or ("/content/" in path and len(path) > 20)
            )
        elif website.name == "www.gov.cn":
            # gov.cn: article containers, or policy/news URL patterns.
            parsed_url = urlparse(url)
            path = parsed_url.path
            is_article_page = (
                soup.find("div", class_="pages_content") is not None
                or soup.find("div", class_="article_con") is not None
                or soup.find("div", class_="content") is not None
                or soup.find("div", id="content") is not None
                or soup.find("div", class_="mainBody") is not None
                or ("/zhengce/" in path)
                or ("/xinwen/" in path)
                or ("/huoban/" in path)
            )
        elif "人民日报" in website.name or "人民网" in website.name:
            # People's Daily: require an <h1> next to generic containers to cut false positives.
            parsed_url = urlparse(url)
            path = parsed_url.path
            is_article_page = (
                (soup.find("div", class_="content") is not None and soup.find("h1") is not None)
                or soup.find("div", class_="article-content") is not None
                or (soup.find("div", id="content") is not None and soup.find("h1") is not None)
                or soup.find("div", class_="text") is not None
                or soup.find("section", class_="content") is not None
                or soup.find("div",
class_="article") is not None or ("/article/" in path) or (path.startswith("/detail/") and len(path) > 10) or # 增加对peopleapp.com特定文章路径的判断 ("/dynamic/" in path and "article" in path) ) elif "央视" in website.name or "CCTV" in website.name or "cctv" in website.name: # 央视网站的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content_area") is not None and soup.find("h1") is not None) or # 央视网特有内容容器 (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/news/" in path) or ("/article/" in path) or (path.startswith("/detail/") and len(path) > 10) or ("ARTI" in path) # 央视网文章URL特征 ) elif "求是" in website.name: # 求是网站的文章页面判断逻辑 - 修复两个标题问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "解放军报" in website.name or "81.cn" in website.name: # 解放军报的文章页面判断逻辑 - 修复有视频的文章不被爬取问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="artichle-info") is not None and soup.find("title") is not None) or # 解放军报特有内容容器 (soup.find("div", class_="article-content") is not None and soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/zt/" in path) or # 解放军报专题栏目 ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) # 排除列表页面 if "/index.html" in path or path.endswith("/"): is_article_page = False elif "光明日报" in website.name or "gmw.cn" in website.name: # 光明日报的文章页面判断逻辑 - 修复不保存文章内容问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "经济日报" in website.name or "ce.cn" in website.name: # 经济日报的文章页面判断逻辑 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or 
("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "中国日报" in website.name or "chinadaily" in website.name: # 中国日报的文章页面判断逻辑 - 修复不保存文章内容问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or (soup.find("div", id="Content") is not None and # 中国日报特有内容容器 soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "工人日报" in website.name or "workercn" in website.name: # 工人日报的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="ccontent") is not None and soup.find("h1") is not None) or # 工人日报特有内容容器 (soup.find("div", class_="article-content") is not None and soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/c/" in path) or # 工人日报文章URL特征 ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "科技日报" in website.name or "stdaily" in website.name: # 科技日报的文章页面判断逻辑 - 修复无法爬取问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "人民政协报" in website.name or "rmzxb" in website.name: # 人民政协报的文章页面判断逻辑 - 修复爬取错误问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "中国纪检监察报" in website.name or "jjjcb" in website.name: # 中国纪检监察报的文章页面判断逻辑 - 修复无法爬取问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", 
class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "中国新闻社" in website.name or "chinanews" in website.name: # 中国新闻社的文章页面判断逻辑 - 修复爬取非文章部分问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or soup.find("div", class_="left_zw") is not None or # 中国新闻网正文区域 ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "学习时报" in website.name or "studytimes" in website.name: # 学习时报的文章页面判断逻辑 - 修复不保存文章内容问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or soup.find("div", id="detail") is not None or # 添加学习时报特有内容容器判断 ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "中国青年报" in website.name or "cyol" in website.name: # 中国青年报的文章页面判断逻辑 - 修复无法爬取问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "中国妇女报" in website.name or "cnwomen" in website.name: # 中国妇女报的文章页面判断逻辑 - 修复不保存文章内容问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "法治日报" in website.name or "legaldaily" in website.name: # 法治日报的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content-two") is not None and soup.find("h1") is not None) or # 法治日报特有内容容器 (soup.find("div", class_="article-content") is not None and soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not 
None) or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/content/" in path and "content_" in path) or # 法治日报特有的文章URL模式 ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "农民日报" in website.name or "farmer" in website.name: # 农民日报的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="detailCon") is not None and soup.find("h1") is not None) or # 农民日报特有内容容器 (soup.find("div", class_="article-content") is not None and soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "学习强国" in website.name or "xuexi" in website.name: # 学习强国的文章页面判断逻辑 - 修复无法爬取问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) elif "旗帜网" in website.name or "qizhiwang" in website.name: # 旗帜网的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="w1200 flag-text-con clearfix") is not None and soup.find("h1") is not None) or # 旗帜网特有内容容器 (soup.find("div", class_="article-content") is not None and soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or ("/n1/" in path) or # 旗帜网文章URL特征 ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) or # 简化判断:只要有h1标题就认为是文章页面 soup.find("h1") is not None ) elif "中国网" in website.name or "china.com.cn" in website.name: # 中国网的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( (soup.find("div", class_="main") is not None and soup.find("h1") is not None) or # 中国网特有内容容器 (soup.find("div", class_="article-content") is not None and soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is 
not None or ("/article/" in path) or ("/content/" in path) or ("/opinion/" in path) or # 中国网观点栏目 (path.startswith("/detail/") and len(path) > 10) ) else: # 默认判断逻辑 is_article_page = ( soup.find("div", class_="content") is not None or soup.find("div", id="content") is not None ) # 如果是文章页面,则调用文章处理 if is_article_page: process_article(url, website) pages_crawled += 1 # 扩展队列,发现新链接 for link in soup.find_all("a", href=True): href = urljoin(url, link["href"]) # 对于人民日报网站,我们扩展链接发现逻辑 if website.name == "人民日报": # 允许爬取以https://www.peopleapp.com/开头的链接 if href.startswith("https://www.peopleapp.com/") and href not in visited: # 增加对文章链接的识别 parsed_href = urlparse(href) href_path = parsed_href.path # 添加更多可能的文章链接模式 if ("/article/" in href_path or href_path.startswith("/detail/") or ("/dynamic/" in href_path and "article" in href_path) or href_path.count("/") > 2): # 更深层 queue.append(href) elif href not in visited and is_valid_url(href, base_netloc): queue.append(href)
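
# ----------------------------------------------------------------------------
# Usage sketch (illustrative only, kept as comments). It assumes the crawl loop
# above lives in a function such as crawl_website(website, start_url, max_pages)
# and that a Website model with `name`, `url` and `is_active` fields exists in
# core.models; both names are assumptions for illustration and should be
# adjusted to the actual definitions in this project.
#
#   from core.models import Website
#
#   def crawl_all_sites(max_pages=50):
#       # Walk every active site record and run one bounded breadth-first crawl per site
#       for website in Website.objects.filter(is_active=True):
#           try:
#               crawl_website(website, website.url, max_pages=max_pages)
#           except Exception as e:
#               print(f"爬取 {website.name} 失败: {e}")
# ----------------------------------------------------------------------------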