diff --git a/core/utils.py b/core/utils.py
index 3e26c6f..5fe781e 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -7,8 +7,77 @@
 from django.utils import timezone
 from django.conf import settings
 from core.models import Article
 import re
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from webdriver_manager.chrome import ChromeDriverManager
+
+def get_selenium_driver():
+    """Get a Selenium WebDriver instance."""
+    try:
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")  # headless mode
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument("--window-size=1920,1080")
+        chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+
+        service = Service(ChromeDriverManager().install())
+        driver = webdriver.Chrome(service=service, options=chrome_options)
+        return driver
+    except Exception as e:
+        print(f"Failed to create Selenium WebDriver: {e}")
+        return None
+
+def get_page_with_selenium(url, website_name):
+    """Fetch dynamically loaded page content with Selenium."""
+    driver = None
+    try:
+        driver = get_selenium_driver()
+        if not driver:
+            return None
+
+        print(f"Loading page with Selenium: {url}")
+        driver.get(url)
+
+        # Decide how long to wait for the page to finish loading
+        wait_time = 10
+        if "学习强国" in website_name:
+            wait_time = 15  # 学习强国 needs more time
+        elif "法治日报" in website_name:
+            wait_time = 12  # 法治日报 needs a longer wait
+
+        # Wait for the main page content to load
+        try:
+            WebDriverWait(driver, wait_time).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+        except Exception:
+            print(f"Timed out waiting for page to load: {url}")
+
+        # Extra wait to make sure dynamic content has finished loading
+        time.sleep(3)
+
+        # Return the rendered page source
+        page_source = driver.page_source
+        return page_source
+
+    except Exception as e:
+        print(f"Selenium failed to fetch page: {url}, error: {e}")
+        return None
+    finally:
+        if driver:
+            try:
+                driver.quit()
+            except Exception:
+                pass
+
 
 def download_media(url, save_dir):
     try:
         # Add request headers to avoid 403 Forbidden errors
@@ -163,37 +232,68 @@ def process_article(url, website):
         "Referer": "http://www.qizhiwang.org.cn/"
     })
 
+    # Decide whether this site needs Selenium
+    need_selenium = False
+    if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
+        need_selenium = True
+
     try:
-        resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
-        resp.raise_for_status()
-
-        # Check for a redirect page
-        if len(resp.text) < 500:
-            print(f"Page content too short, possibly a redirect page: {url}")
-            return
+        if need_selenium:
+            # Use Selenium to fetch dynamically loaded content
+            page_source = get_page_with_selenium(url, website.name)
+            if not page_source:
+                print(f"Selenium failed to fetch page: {url}")
+                return
+
+            # Check whether the page content is too short
+            min_length = 100 if "法治日报" in website.name else 300
+            if len(page_source) < min_length:
+                print(f"Page content too short, possibly a redirect page: {url}")
+                return
+
+            # Build the BeautifulSoup object
+            soup = BeautifulSoup(page_source, "html.parser")
+        else:
+            # Use requests to fetch static content
+            resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
+            resp.raise_for_status()
+
+            # Check for a redirect page
+            if len(resp.text) < 300:
+                print(f"Page content too short, possibly a redirect page: {url}")
+                return
+
+            # Build the BeautifulSoup object
+            soup = BeautifulSoup(resp.text, "html.parser")
+
     except Exception as e:
         print(f"Request failed: {url}, error: {e}")
         return
 
-    # Set the correct encoding for each website
-    if "人民网" in website.name or "人民日报" in website.name:
-        resp.encoding = 'utf-8'
-    elif "新华网" in website.name:
-        resp.encoding = 'utf-8'
-    elif "央视" in website.name or "CCTV" in website.name:
-        resp.encoding = 'utf-8'
-    elif "农民日报" in website.name or "farmer" in website.name:
-        resp.encoding = 'utf-8'
-        # Try another encoding ('�' marks characters the current charset could not decode)
-        if '�' in resp.text or len(resp.text) < 1000:
-            resp.encoding = 'gbk'
-            # Try yet another encoding
-            if '�' in resp.text:
-                resp.encoding = 'gb2312'
-    else:
-        resp.encoding = 'utf-8'
-
-    soup = BeautifulSoup(resp.text, "html.parser")
+    # Set the correct encoding for each website (only for content fetched with requests)
+    if not need_selenium:
+        if "人民网" in website.name or "人民日报" in website.name:
+            resp.encoding = 'utf-8'
+        elif "新华网" in website.name:
+            resp.encoding = 'utf-8'
+        elif "央视" in website.name or "CCTV" in website.name:
+            resp.encoding = 'utf-8'
+        elif "农民日报" in website.name or "farmer" in website.name:
+            resp.encoding = 'utf-8'
+            # Try another encoding ('�' marks characters the current charset could not decode)
+            if '�' in resp.text or len(resp.text) < 1000:
+                resp.encoding = 'gbk'
+                # Try yet another encoding
+                if '�' in resp.text:
+                    resp.encoding = 'gb2312'
+                    # If problems persist, try more encodings
+                    if '�' in resp.text:
+                        resp.encoding = 'utf-8-sig'
+                        # Last resort
+                        if '�' in resp.text:
+                            resp.encoding = 'big5'
+        else:
+            resp.encoding = 'utf-8'
 
     # Handle the article structure of each website
     if website.name == "新华网" or website.name == "www.news.cn" or "新华网" in website.name:
@@ -777,6 +877,11 @@ def process_article(url, website):
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 中国纪检监察报: make sure the title gets extracted
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -784,7 +889,10 @@
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="main") or
+            soup.find("div", class_="detail") or
+            soup.find("div", class_="article_yt")  # container specific to 中国纪检监察报
         )
     elif "中国新闻社" in website.name or "chinanews" in website.name:
         # Article structure handling for 中国新闻社 - fixes scraping of non-article sections
@@ -847,7 +955,13 @@
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 中国青年报: make sure the title gets extracted
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
+            soup.find("div", class_="main") or  # container specific to 中国青年报
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
@@ -863,7 +977,14 @@
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 中国妇女报: make sure the title gets extracted
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
+            soup.find("div", class_="main") or  # container specific to 中国妇女报
+            soup.find("div", class_="news") or  # container specific to 中国妇女报
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
@@ -880,6 +1001,11 @@
             soup.find("h1") or
             soup.find("title")
         )
+
+        # Special handling for 法治日报: make sure the title gets extracted
+        if not title_tag or not title_tag.get_text(strip=True):
+            title_tag = soup.find("title")
+
         content_tag = (
             soup.find("div", class_="content-two") or  # check the content-two class first
             soup.find("div", class_="article-content") or  # container specific to 法治日报
@@ -888,7 +1014,10 @@
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
class_="article-detail") or # 法治日报特有内容容器 + soup.find("div", class_="detail-content") or # 法治日报特有内容容器 + soup.find("div", class_="article-text") # 法治日报特有内容容器 ) # 针对法治日报的特殊处理,清理内容中的无关元素 @@ -935,6 +1064,10 @@ def process_article(url, website): title_text = title_tag.get_text(strip=True) if title_text and any(char in title_text for char in ['', '', '']): title_tag = soup.find("title") + + # 针对农民日报的特殊处理,确保标题被正确提取 + if not title_tag or not title_tag.get_text(strip=True): + title_tag = soup.find("title") content_tag = ( soup.find("div", class_="detailCon") or # 农民日报特有内容容器 soup.find("div", class_="article-content") or @@ -983,6 +1116,17 @@ def process_article(url, website): soup.find("h1") or soup.find("title") ) + + # 针对学习强国的特殊处理,确保标题被正确提取 + if not title_tag or not title_tag.get_text(strip=True): + title_tag = soup.find("title") + + # 针对学习强国的特殊处理,如果标题太短,尝试从title标签提取 + if title_tag: + title_text = title_tag.get_text(strip=True) + if title_text and len(title_text) < 10: + title_tag = soup.find("title") + content_tag = ( soup.find("div", class_="content") or soup.find("div", class_="article-content") or @@ -990,7 +1134,14 @@ def process_article(url, website): soup.find("div", class_="text") or soup.find("div", class_="main-content") or soup.find("div", class_="article") or - soup.find("div", class_="article-body") + soup.find("div", class_="article-body") or + soup.find("div", class_="main") or + soup.find("div", class_="detail") or + soup.find("div", class_="lgpage-detail") or # 学习强国特有内容容器 + soup.find("div", class_="detail-content") or # 学习强国特有内容容器 + soup.find("div", class_="article-detail") or # 学习强国特有内容容器 + soup.find("div", class_="xuexi") or # 学习强国特有内容容器 + soup.find("div", class_="kNews") # 学习强国特有内容容器 ) elif "旗帜网" in website.name or "qizhiwang" in website.name: # 旗帜网的文章结构处理 - 修复不保存正文和图片问题 @@ -1002,6 +1153,22 @@ def process_article(url, website): soup.find("h1") or soup.find("title") ) + + # 针对旗帜网的特殊处理,如果标题为空或太短,尝试从title标签提取 + if title_tag: + title_text = title_tag.get_text(strip=True) + if not title_text or len(title_text) < 5: + title_tag = soup.find("title") + + # 针对旗帜网的特殊处理,确保标题被正确提取 + if not title_tag or not title_tag.get_text(strip=True): + title_tag = soup.find("title") + + # 针对旗帜网的特殊处理,如果标题太短,尝试从title标签提取 + if title_tag: + title_text = title_tag.get_text(strip=True) + if title_text and len(title_text) < 10: + title_tag = soup.find("title") content_tag = ( soup.find("div", class_="w1200 flag-text-con clearfix") or # 旗帜网特有内容容器 soup.find("div", class_="article-content") or # 旗帜网特有内容容器 @@ -1065,14 +1232,23 @@ def process_article(url, website): soup.find("h1") or soup.find("title") ) + + # 针对中国网的特殊处理,如果标题为空或太短,尝试从title标签提取 + if title_tag: + title_text = title_tag.get_text(strip=True) + if not title_text or len(title_text) < 5: + title_tag = soup.find("title") + content_tag = ( - soup.find("div", class_="main") or # 中国网特有内容容器 + soup.find("div", class_="article") or # 中国网特有内容容器 + soup.find("div", class_="main") or + soup.find("div", class_="textBox") or # 中国网直播特有内容容器 + soup.find("div", class_="artInfo") or # 中国网直播特有内容容器 soup.find("div", class_="article-content") or soup.find("div", class_="content") or soup.find("div", id="content") or soup.find("div", class_="text") or soup.find("div", class_="main-content") or - soup.find("div", class_="article") or soup.find("div", class_="article-body") )