diff --git a/core/utils.py b/core/utils.py index 940ac7a..3e26c6f 100644 --- a/core/utils.py +++ b/core/utils.py @@ -92,10 +92,85 @@ def process_article(url, website): "Upgrade-Insecure-Requests": "1", "Referer": "https://www.gmw.cn/" }) + # 添加央视网的特殊请求头 + elif "央视" in website.name or "CCTV" in website.name or "cctv" in website.name: + headers.update({ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Referer": "https://news.cctv.com/" + }) + # 添加中国网的特殊请求头 + elif "中国网" in website.name or "china.com.cn" in website.name: + headers.update({ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Referer": "http://www.china.com.cn/" + }) + # 添加法治日报的特殊请求头 + elif "法治日报" in website.name or "legaldaily" in website.name: + headers.update({ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Referer": "http://www.legaldaily.com.cn/" + }) + # 添加工人日报的特殊请求头 + elif "工人日报" in website.name or "workercn" in website.name: + headers.update({ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Referer": "https://www.workercn.cn/" + }) + # 添加农民日报的特殊请求头 + elif "农民日报" in website.name or "farmer" in website.name: + headers.update({ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Referer": "https://www.farmer.com.cn/" + }) + # 添加解放军报的特殊请求头 + elif "解放军报" in website.name or "81.cn" in website.name: + headers.update({ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Referer": "http://www.81.cn/" + }) + # 添加旗帜网的特殊请求头 + elif "旗帜网" in website.name or "qizhiwang" in website.name: + headers.update({ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", + "Accept-Encoding": "gzip, deflate", + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + "Referer": "http://www.qizhiwang.org.cn/" + }) try: - resp = requests.get(url, headers=headers, timeout=15) + resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True) resp.raise_for_status() + + # 检查是否是重定向页面 + if len(resp.text) < 500: + print(f"页面内容过短,可能是重定向页面:{url}") + return except Exception as e: print(f"请求失败:{url},错误:{e}") return @@ -107,6 +182,14 @@ def process_article(url, website): resp.encoding = 'utf-8' elif "央视" in website.name or "CCTV" in website.name: resp.encoding = 'utf-8' + elif "农民日报" in website.name or "farmer" in website.name: + 
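+        # Editor's note: the per-site header blocks added above are identical except for
+        # the Referer value. A possible table-driven form (sketch only; the keyword->Referer
+        # pairs below are copied from the blocks above, and the real chain also matches
+        # English aliases such as "CCTV", "legaldaily", "workercn", "81.cn", "qizhiwang"):
+        #
+        #     SITE_REFERERS = {
+        #         "央视": "https://news.cctv.com/",
+        #         "中国网": "http://www.china.com.cn/",
+        #         "法治日报": "http://www.legaldaily.com.cn/",
+        #         "工人日报": "https://www.workercn.cn/",
+        #         "农民日报": "https://www.farmer.com.cn/",
+        #         "解放军报": "http://www.81.cn/",
+        #         "旗帜网": "http://www.qizhiwang.org.cn/",
+        #     }
+        #     for keyword, referer in SITE_REFERERS.items():
+        #         if keyword in website.name:
+        #             headers["Referer"] = referer
+        #             break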
+        resp.encoding = 'utf-8'
+        # 尝试其他编码
+        if '�' in resp.text or len(resp.text) < 1000:
+            resp.encoding = 'gbk'
+            # 进一步尝试其他编码
+            if '�' in resp.text:
+                resp.encoding = 'gb2312'
     else:
         resp.encoding = 'utf-8'
@@ -146,7 +229,7 @@ def process_article(url, website):
         if not content_tag:
             content_tag = soup.find("div", id="ContentText")
     elif website.name == "www.gov.cn":
-        # 中国政府网的文章结构处理 - 修复两个标题问题
+        # 中国政府网的文章结构处理 - 修复标题重复问题
         title_tag = (
             soup.find("h1", class_="title") or
             soup.find("h1") or
@@ -160,6 +243,38 @@ def process_article(url, website):
             soup.find("div", id="content") or
             soup.find("div", class_="mainBody")
         )
+
+        # 针对中国政府网的特殊处理,清理内容中的重复标题
+        if content_tag and title_tag:
+            title_text = title_tag.get_text(strip=True)
+            if title_text:
+                # 移除内容中的重复标题元素
+                for heading in content_tag.find_all(["h1", "h2", "h3"]):
+                    heading_text = heading.get_text(strip=True)
+                    if title_text in heading_text or heading_text in title_text:
+                        heading.decompose()
+
+                # 移除class包含title的元素
+                for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
+                    title_element_text = title_element.get_text(strip=True)
+                    if title_text in title_element_text or title_element_text in title_text:
+                        title_element.decompose()
+
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     elif "人民日报" in website.name or "人民网" in website.name:
         # 人民日报网站的文章结构处理 - 修复乱码和404问题
         title_tag = (
@@ -220,21 +335,50 @@ def process_article(url, website):
             # 移除相关专题列表
             for topic_element in content_tag.find_all("div", class_="clearfix text_like"):
                 topic_element.decompose()
-    elif "央视" in website.name or "CCTV" in website.name:
-        # 央视网站的文章结构处理 - 修复视频下载问题
+    elif "央视" in website.name or "CCTV" in website.name or "cctv" in website.name:
+        # 央视网站的文章结构处理 - 修复不保存正文和图片问题
         title_tag = (
             soup.find("h1", class_="title") or
+            soup.find("h1", class_="title_text") or  # 央视网特有标题类
             soup.find("h1") or
             soup.find("title")
         )
         content_tag = (
+            soup.find("div", class_="content_area") or  # 央视网特有内容容器
            soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
-            soup.find("div", class_="article")
+            soup.find("div", class_="article") or
+            soup.find("div", class_="article-body")
         )
+
+        # 针对央视网的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除评论区域
+            for comment_element in content_tag.find_all("div", class_="comment"):
+                comment_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     elif "求是" in website.name:
         # 求是网站的文章结构处理 - 修复标题和正文清理问题
         title_tag = (
@@ -245,6 +389,16 @@ def process_article(url, website):
             soup.find("p", class_="title") or
             soup.find("title")
         )
+
+        # 
针对求是的特殊处理,如果标题为空或太短,尝试从title标签提取 + if title_tag: + title_text = title_tag.get_text(strip=True) + if not title_text or len(title_text) < 5: + title_tag = soup.find("title") + + # 针对求是的特殊处理,确保标题被正确提取 + if not title_tag or not title_tag.get_text(strip=True): + title_tag = soup.find("title") content_tag = ( soup.find("div", class_="content") or @@ -260,13 +414,30 @@ def process_article(url, website): if title_tag: title_text = title_tag.get_text(strip=True) if title_text: + # 移除所有可能的重复标题元素 for strong_tag in content_tag.find_all("strong"): strong_text = strong_tag.get_text(strip=True) - if title_text in strong_text or strong_text in title_text: + if strong_text and (title_text in strong_text or strong_text in title_text): parent_p = strong_tag.find_parent("p") # 如果 strong 在正文前两段内,就删除 if parent_p in content_tag.find_all("p")[:2]: strong_tag.decompose() + + # 移除h1、h2、h3标题元素中的重复标题 + for heading in content_tag.find_all(["h1", "h2", "h3"]): + heading_text = heading.get_text(strip=True) + if heading_text and (title_text in heading_text or heading_text in title_text): + # 确保不删除title_tag本身 + if heading != title_tag: + heading.decompose() + + # 移除class包含title的元素 + for title_element in content_tag.find_all(class_=lambda x: x and "title" in x): + title_element_text = title_element.get_text(strip=True) + if title_element_text and (title_text in title_element_text or title_element_text in title_text): + # 确保不删除title_tag本身 + if title_element != title_tag: + title_element.decompose() # 移除分享相关元素 for share_element in content_tag.find_all("div", class_="sharebox"): @@ -304,20 +475,84 @@ def process_article(url, website): for highlight_element in content_tag.find_all("div", class_="highlight"): highlight_element.unwrap() elif "解放军报" in website.name or "81.cn" in website.name: - # 解放军报的文章结构处理 - 修复类别爬取问题 + # 解放军报的文章结构处理 - 修复有视频的文章不被爬取问题 title_tag = ( soup.find("h1", class_="title") or + soup.find("h1", class_="article-title") or # 解放军报特有标题类 soup.find("h1") or + soup.find("h2") or # 解放军报使用h2标签作为标题 soup.find("title") ) + + # 针对解放军报的特殊处理,如果标题为空或太短,尝试从title标签提取 + if title_tag: + title_text = title_tag.get_text(strip=True) + if not title_text or len(title_text) < 5: + title_tag = soup.find("title") content_tag = ( - soup.find("div", class_="content") or - soup.find("div", class_="article-content") or - soup.find("div", id="content") or + soup.find("div", id="article-content") or # 解放军报实际文章内容容器 soup.find("div", class_="text") or + soup.find("div", class_="article-content") or + soup.find("div", class_="content") or + soup.find("div", id="content") or soup.find("div", class_="main-content") or - soup.find("div", class_="article") + soup.find("div", class_="article") or + soup.find("div", class_="article-body") or + soup.find("div", class_="artichle-info") # 作为备选 ) + + # 针对解放军报的特殊处理,清理内容中的无关元素 + if content_tag: + # 移除面包屑导航 + for breadcrumb in content_tag.find_all("ol", class_="breadcrumb"): + breadcrumb.decompose() + + # 移除分享相关元素 + for share_element in content_tag.find_all("div", class_="share-custom"): + share_element.decompose() + + # 移除作者信息段落 + for author_p in content_tag.find_all("p"): + text = author_p.get_text(strip=True) + if "来源:" in text or "作者:" in text or "责任编辑:" in text or "发布:" in text: + author_p.decompose() + + # 移除进度条 + for progress in content_tag.find_all("div", class_="progress-bar"): + progress.decompose() + + # 移除播放器 + for player in content_tag.find_all("div", class_="player"): + player.decompose() + + # 移除媒体URL容器 + for media in content_tag.find_all("div", id="mediaurl"): + media.decompose() + + # 
移除新闻列表(但保留其中的内容) + for news_list in content_tag.find_all("ul", id="main-news-list"): + # 不删除整个ul,而是unwrap它,保留其中的内容 + news_list.unwrap() + + # 移除编辑信息 + for editor_element in content_tag.find_all("div", class_="editor"): + editor_element.decompose() + + # 移除分享相关元素 + for share_element in content_tag.find_all("div", class_="share"): + share_element.decompose() + + # 移除相关推荐 + for recommend_element in content_tag.find_all("div", class_="related"): + recommend_element.decompose() + + # 移除script标签 + for script_element in content_tag.find_all("script"): + script_element.decompose() + + # 移除样式标签 + for style_element in content_tag.find_all("style"): + style_element.decompose() elif "光明日报" in website.name or "gmw.cn" in website.name: # 光明日报的文章结构处理 - 修复不保存文章内容问题 title_tag = ( @@ -402,21 +637,57 @@ def process_article(url, website): # 更新content_tag为包含所有分页内容 content_tag = BeautifulSoup(all_content_html, "html.parser") elif "工人日报" in website.name or "workercn" in website.name: - # 工人日报的文章结构处理 - 修复不保存文章内容问题 + # 工人日报的文章结构处理 - 修复不保存正文和图片问题 title_tag = ( soup.find("h1", class_="title") or + soup.find("h1", class_="article-title") or # 工人日报特有标题类 soup.find("h1") or soup.find("title") ) + + # 针对工人日报的特殊处理,如果标题为空或太短,尝试从title标签提取 + if title_tag: + title_text = title_tag.get_text(strip=True) + if not title_text or len(title_text) < 5: + title_tag = soup.find("title") + + # 进一步处理:如果h1标题包含太多无关信息,尝试从title标签提取更简洁的标题 + if title_tag and title_tag.name == 'h1': + title_text = title_tag.get_text(strip=True) + if title_text and len(title_text) > 50: # 如果h1标题太长 + title_tag = soup.find("title") content_tag = ( - soup.find("div", class_="content") or + soup.find("div", class_="ccontent") or # 工人日报特有内容容器 soup.find("div", class_="article-content") or + soup.find("div", class_="content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or soup.find("div", class_="article-body") ) + + # 针对工人日报的特殊处理,清理内容中的无关元素 + if content_tag: + # 移除编辑信息 + for editor_element in content_tag.find_all("div", class_="editor"): + editor_element.decompose() + + # 移除分享相关元素 + for share_element in content_tag.find_all("div", class_="share"): + share_element.decompose() + + # 移除相关推荐 + for recommend_element in content_tag.find_all("div", class_="related"): + recommend_element.decompose() + + # 移除script标签 + for script_element in content_tag.find_all("script"): + script_element.decompose() + + # 移除样式标签 + for style_element in content_tag.find_all("style"): + style_element.decompose() elif "科技日报" in website.name or "stdaily" in website.name: # 科技日报的文章结构处理 - 修复无法爬取问题 title_tag = ( @@ -602,16 +873,17 @@ def process_article(url, website): soup.find("div", class_="article-body") ) elif "法治日报" in website.name or "legaldaily" in website.name: - # 法治日报的文章结构处理 - 修复无法爬取问题 + # 法治日报的文章结构处理 - 修复不保存正文和图片问题 title_tag = ( soup.find("h1", class_="title") or + soup.find("h1", class_="article-title") or # 法治日报特有标题类 soup.find("h1") or soup.find("title") ) content_tag = ( soup.find("div", class_="content-two") or # 优先查找content-two类 + soup.find("div", class_="article-content") or # 法治日报特有内容容器 soup.find("div", class_="content") or - soup.find("div", class_="article-content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or @@ -619,30 +891,91 @@ def process_article(url, website): soup.find("div", class_="article-body") ) - # 如果找到content-two,需要进一步处理去除内部的标题元素(避免重复) - if content_tag and content_tag.get('class') and 
'content-two' in content_tag.get('class', []):
-            # 查找并移除内容中的标题元素(避免重复)
-            inner_titles = content_tag.find_all(['h1', 'h2'])
-            title_text = title_tag.get_text(strip=True) if title_tag else "无标题"
-            for inner_title in inner_titles:
-                if inner_title.get_text().strip() == title_text:
-                    inner_title.decompose()
+        # 针对法治日报的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 如果找到content-two,需要进一步处理去除内部的标题元素(避免重复)
+            if content_tag.get('class') and 'content-two' in content_tag.get('class', []):
+                # 查找并移除内容中的标题元素(避免重复)
+                inner_titles = content_tag.find_all(['h1', 'h2'])
+                title_text = title_tag.get_text(strip=True) if title_tag else "无标题"
+                for inner_title in inner_titles:
+                    if inner_title.get_text().strip() == title_text:
+                        inner_title.decompose()
+
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     elif "农民日报" in website.name or "farmer" in website.name:
-        # 农民日报的文章结构处理 - 修复正文未被爬取问题
+        # 农民日报的文章结构处理 - 修复不保存正文和图片问题
         title_tag = (
             soup.find("h1", class_="title") or
+            soup.find("h1", class_="article-title") or  # 农民日报特有标题类
             soup.find("h1") or
             soup.find("title")
         )
+
+        # 针对农民日报的特殊处理,如果标题出现乱码,尝试从title标签提取
+        if title_tag and title_tag.name == 'h1':
+            title_text = title_tag.get_text(strip=True)
+            if title_text and any(char in title_text for char in ['�']):
+                title_tag = soup.find("title")
         content_tag = (
-            soup.find("div", class_="content") or
+            soup.find("div", class_="detailCon") or  # 农民日报特有内容容器
             soup.find("div", class_="article-content") or
+            soup.find("div", class_="content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
+
+        # 针对农民日报的特殊处理,如果找到多个detailCon,选择内容最长的那个
+        if content_tag and content_tag.get('class') and 'detailCon' in content_tag.get('class', []):
+            detail_cons = soup.find_all("div", class_="detailCon")
+            if len(detail_cons) > 1:
+                # 选择内容最长的detailCon
+                longest_content = max(detail_cons, key=lambda x: len(x.get_text(strip=True)))
+                if len(longest_content.get_text(strip=True)) > len(content_tag.get_text(strip=True)):
+                    content_tag = longest_content
+
+        # 针对农民日报的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     elif "学习强国" in website.name or "xuexi" in website.name:
         # 学习强国的文章结构处理 - 修复无法爬取问题
         title_tag = (
@@ -660,18 +993,19 @@ def process_article(url, website):
             soup.find("div", class_="article-body")
         )
     elif "旗帜网" in website.name or "qizhiwang" in website.name:
-        # 旗帜网的文章结构处理 - 修复不保存文章内容问题
+        # 旗帜网的文章结构处理 - 修复不保存正文和图片问题
         title_tag = (
             soup.find("div", class_="w1200 flag-text-tit clearfix") and soup.find("div", class_="w1200 flag-text-tit clearfix").find("h1") or
             soup.find("h1", class_="title") or
+            soup.find("h1", class_="article-title") or  # 旗帜网特有标题类
             soup.find("h1") or
             soup.find("title")
         )
         content_tag = (
             soup.find("div", class_="w1200 flag-text-con clearfix") or  # 旗帜网特有内容容器
+            soup.find("div", class_="article-content") or  # 旗帜网特有内容容器
             soup.find("div", class_="content") or
-            soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
@@ -679,27 +1013,77 @@ def process_article(url, website):
             soup.find("div", class_="article-body")
         )
+        # 针对旗帜网的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
+
     elif "中国网" in website.name or "china.com.cn" in website.name:
-        # 中国网的文章结构处理 - 修复不保存文章内容问题
+        # 中国网的文章结构处理 - 修复不保存正文和图片问题
         title_tag = (
             soup.find("h1", class_="title") or
+            soup.find("h1", class_="article-title") or  # 中国网特有标题类
             soup.find("h1") or
             soup.find("title")
         )
         content_tag = (
-            soup.find("div", class_="content") or
+            soup.find("div", class_="main") or  # 中国网特有内容容器
             soup.find("div", class_="article-content") or
+            soup.find("div", class_="content") or
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
+
+        # 针对中国网的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除相关推荐
+            for recommend_element in content_tag.find_all("div", class_="related"):
+                recommend_element.decompose()
+
+            # 移除script标签
+            for script_element in content_tag.find_all("script"):
+                script_element.decompose()
+
+            # 移除样式标签
+            for style_element in content_tag.find_all("style"):
+                style_element.decompose()
     else:
         # 默认处理方式
         title_tag = soup.find("h1") or soup.find("title")
         content_tag = soup.find("div", class_="content") or soup.find("div", id="content")
+    # 最终标题处理 - 只有在没有网站特定处理时才使用默认处理
+    if not title_tag:
+        title_tag = soup.find("h1") or soup.find("title")
+
     title = title_tag.get_text(strip=True) if title_tag else "无标题"

     # 对标题进行额外处理,去除可能的多余空白字符
@@ -776,7 +1182,7 @@ def process_article(url, website):
             src = urljoin(url, src)

             # 针对央视网等特殊处理
-            if "央视" in website.name or "CCTV" in website.name:
+            if 
"央视" in website.name or "CCTV" in website.name or "cctv" in website.name: # 央视网视频可能需要特殊处理 if "cctv.com" in src or "cntv.cn" in src: print(f"发现央视视频: {src}") @@ -785,6 +1191,13 @@ def process_article(url, website): elif "新华网" in website.name: print(f"发现新华网视频: {src}") + # 针对解放军报的特殊处理 + elif "解放军报" in website.name or "81.cn" in website.name: + print(f"发现解放军报视频: {src}") + # 解放军报视频可能需要特殊处理 + if "81.cn" in src: + print(f"处理解放军报视频: {src}") + local_path = download_media(src, save_dir) if local_path: rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT) @@ -920,11 +1333,13 @@ def full_site_crawler(start_url, website, max_pages=1000): # 增加对peopleapp.com特定文章路径的判断 ("/dynamic/" in path and "article" in path) ) - elif "央视" in website.name or "CCTV" in website.name: - # 央视网站的文章页面判断逻辑 - 修复视频下载问题 + elif "央视" in website.name or "CCTV" in website.name or "cctv" in website.name: + # 央视网站的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( + (soup.find("div", class_="content_area") is not None and + soup.find("h1") is not None) or # 央视网特有内容容器 (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="article-content") is not None or @@ -933,9 +1348,11 @@ def full_site_crawler(start_url, website, max_pages=1000): soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or + soup.find("div", class_="article-body") is not None or ("/news/" in path) or ("/article/" in path) or - (path.startswith("/detail/") and len(path) > 10) + (path.startswith("/detail/") and len(path) > 10) or + ("ARTI" in path) # 央视网文章URL特征 ) elif "求是" in website.name: # 求是网站的文章页面判断逻辑 - 修复两个标题问题 @@ -950,22 +1367,31 @@ def full_site_crawler(start_url, website, max_pages=1000): (path.startswith("/detail/") and len(path) > 10) ) elif "解放军报" in website.name or "81.cn" in website.name: - # 解放军报的文章页面判断逻辑 - 修复类别爬取问题 + # 解放军报的文章页面判断逻辑 - 修复有视频的文章不被爬取问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( + (soup.find("div", class_="artichle-info") is not None and + soup.find("title") is not None) or # 解放军报特有内容容器 + (soup.find("div", class_="article-content") is not None and + soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or - soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or + soup.find("div", class_="article-body") is not None or + ("/zt/" in path) or # 解放军报专题栏目 ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) ) + + # 排除列表页面 + if "/index.html" in path or path.endswith("/"): + is_article_page = False elif "光明日报" in website.name or "gmw.cn" in website.name: # 光明日报的文章页面判断逻辑 - 修复不保存文章内容问题 parsed_url = urlparse(url) @@ -1022,19 +1448,23 @@ def full_site_crawler(start_url, website, max_pages=1000): (path.startswith("/detail/") and len(path) > 10) ) elif "工人日报" in website.name or "workercn" in website.name: - # 工人日报的文章页面判断逻辑 - 修复不保存文章内容问题 + # 工人日报的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( + (soup.find("div", class_="ccontent") is not None and + soup.find("h1") is not None) or # 工人日报特有内容容器 + (soup.find("div", class_="article-content") is not None and + 
soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or - soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or + ("/c/" in path) or # 工人日报文章URL特征 ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) @@ -1168,13 +1598,16 @@ def full_site_crawler(start_url, website, max_pages=1000): (path.startswith("/detail/") and len(path) > 10) ) elif "法治日报" in website.name or "legaldaily" in website.name: - # 法治日报的文章页面判断逻辑 - 修复无法爬取问题 + # 法治日报的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( + (soup.find("div", class_="content-two") is not None and + soup.find("h1") is not None) or # 法治日报特有内容容器 + (soup.find("div", class_="article-content") is not None and + soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or - soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or @@ -1187,13 +1620,16 @@ def full_site_crawler(start_url, website, max_pages=1000): (path.startswith("/detail/") and len(path) > 10) ) elif "农民日报" in website.name or "farmer" in website.name: - # 农民日报的文章页面判断逻辑 - 修复正文未被爬取问题 + # 农民日报的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( + (soup.find("div", class_="detailCon") is not None and + soup.find("h1") is not None) or # 农民日报特有内容容器 + (soup.find("div", class_="article-content") is not None and + soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or - soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or @@ -1223,30 +1659,39 @@ def full_site_crawler(start_url, website, max_pages=1000): (path.startswith("/detail/") and len(path) > 10) ) elif "旗帜网" in website.name or "qizhiwang" in website.name: - # 旗帜网的文章页面判断逻辑 - 修复不保存文章内容问题 + # 旗帜网的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path is_article_page = ( + (soup.find("div", class_="w1200 flag-text-con clearfix") is not None and + soup.find("h1") is not None) or # 旗帜网特有内容容器 + (soup.find("div", class_="article-content") is not None and + soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or - soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or + ("/n1/" in path) or # 旗帜网文章URL特征 ("/article/" in path) or ("/content/" in path) or - (path.startswith("/detail/") and len(path) > 10) + (path.startswith("/detail/") and len(path) > 10) or + # 简化判断:只要有h1标题就认为是文章页面 + soup.find("h1") is not None ) elif "中国网" in website.name or "china.com.cn" in website.name: - # 中国网的文章页面判断逻辑 - 修复不保存文章内容问题 + # 中国网的文章页面判断逻辑 - 修复不保存正文和图片问题 parsed_url = urlparse(url) path = parsed_url.path 
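+            # Editor's note: every branch of this is_article_page chain repeats the same
+            # shape (probe a few site-specific containers, then fall back to URL path
+            # markers). A possible data-driven sketch; ARTICLE_RULES and site_key are
+            # hypothetical names, and the selectors/markers are copied from the branches
+            # above rather than newly verified:
+            #
+            #     ARTICLE_RULES = {
+            #         "工人日报": (["div.ccontent", "div.article-content"], ["/c/", "/article/"]),
+            #         "法治日报": (["div.content-two", "div.article-content"], ["/content/"]),
+            #         "旗帜网": (["div.article-content"], ["/n1/", "/article/"]),
+            #     }
+            #     selectors, markers = ARTICLE_RULES.get(site_key, ([], []))
+            #     is_article_page = (
+            #         any(soup.select_one(sel) for sel in selectors) and soup.find("h1") is not None
+            #     ) or any(marker in path for marker in markers)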
is_article_page = ( + (soup.find("div", class_="main") is not None and + soup.find("h1") is not None) or # 中国网特有内容容器 + (soup.find("div", class_="article-content") is not None and + soup.find("h1") is not None) or (soup.find("div", class_="content") is not None and soup.find("h1") is not None) or - soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or soup.find("div", class_="text") is not None or @@ -1255,6 +1700,7 @@ def full_site_crawler(start_url, website, max_pages=1000): soup.find("div", class_="article-body") is not None or ("/article/" in path) or ("/content/" in path) or + ("/opinion/" in path) or # 中国网观点栏目 (path.startswith("/detail/") and len(path) > 10) ) else: