Support keyword

2025-09-24 03:38:32 +08:00
parent a4891b1c30
commit 8592833d74
16 changed files with 2888 additions and 2 deletions


@@ -1,7 +1,7 @@
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib.parse import urljoin, urlparse, quote
from collections import deque
from django.utils import timezone
from django.conf import settings
@@ -15,6 +15,8 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
import json
def get_selenium_driver():
@@ -2270,3 +2272,695 @@ def full_site_crawler(start_url, website, max_pages=1000):
queue.append(href)
elif href not in visited and is_valid_url(href, base_netloc):
queue.append(href)
# Website search configuration
WEBSITE_SEARCH_CONFIGS = {
"新华网": {
"search_url": "http://so.news.cn/getNews",
"search_params": {
"keyword": "{keyword}",
"curPage": "{page}",
"sortField": "0",
"sortType": "1"
},
"method": "post",
"headers": {
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
},
"人民日报": {
"search_url": "http://search.people.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"st": "0",
"startDate": "{start_date}",
"endDate": "{end_date}",
"page": "{page}"
},
"method": "get"
},
"央视网": {
"search_url": "https://search.cctv.com/search.php",
"search_params": {
"qtext": "{keyword}",
"type": "web",
"page": "{page}"
},
"method": "get"
},
"光明日报": {
"search_url": "http://search.gmw.cn/search",
"search_params": {
"q": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"经济日报": {
"search_url": "http://www.ce.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国日报": {
"search_url": "http://www.chinadaily.com.cn/search",
"search_params": {
"q": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"工人日报": {
"search_url": "https://www.workercn.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"科技日报": {
"search_url": "http://www.stdaily.com/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"人民政协网": {
"search_url": "https://www.rmzxw.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国纪检监察报": {
"search_url": "http://www.jjjcb.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国新闻社": {
"search_url": "https://www.chinanews.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"学习时报": {
"search_url": "https://www.studytimes.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国青年报": {
"search_url": "http://news.cyol.com/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国妇女报": {
"search_url": "https://www.cnwomen.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"法治日报": {
"search_url": "http://www.legaldaily.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"农民日报": {
"search_url": "https://www.farmer.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"学习强国": {
"search_url": "https://www.xuexi.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"旗帜网": {
"search_url": "http://www.qizhiwang.org.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国网": {
"search_url": "http://www.china.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国政府网": {
"search_url": "https://www.gov.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"求是网": {
"search_url": "http://www.qstheory.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"解放军报": {
"search_url": "http://www.81.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
}
}
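# A minimal sketch (comments only, not executed) of how one entry above is expected
# to expand into a request; the keyword and dates below are hypothetical placeholders:
#   config = WEBSITE_SEARCH_CONFIGS["央视网"]
#   params = {k: v.format(keyword=quote("示例"), page=1,
#                         start_date="2025-01-01", end_date="2025-09-24")
#             for k, v in config["search_params"].items()}
#   requests.get(config["search_url"], params=params, timeout=15)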
def search_articles_by_keyword(website_name, keyword, max_pages=10, start_date=None, end_date=None):
"""
根据关键词搜索文章
Args:
website_name: 网站名称
keyword: 搜索关键词
max_pages: 最大搜索页数
start_date: 开始日期 (YYYY-MM-DD)
end_date: 结束日期 (YYYY-MM-DD)
Returns:
list: 搜索到的文章URL列表
"""
if website_name not in WEBSITE_SEARCH_CONFIGS:
print(f"网站 {website_name} 不支持搜索功能")
return []
config = WEBSITE_SEARCH_CONFIGS[website_name]
article_urls = []
# Set the default date range
if not start_date:
start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
if not end_date:
end_date = datetime.now().strftime("%Y-%m-%d")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
headers.update(config.get("headers", {}))
for page in range(1, max_pages + 1):
try:
# Build search parameters (note: quote() pre-encodes the keyword; requests encodes params again, so some sites may receive a double-encoded value)
search_params = {}
for key, value in config["search_params"].items():
search_params[key] = value.format(
keyword=quote(keyword),
page=page,
start_date=start_date,
end_date=end_date
)
print(f"搜索 {website_name}{page} 页: {keyword}")
if config["method"] == "post":
response = requests.post(
config["search_url"],
data=search_params,
headers=headers,
timeout=15
)
else:
response = requests.get(
config["search_url"],
params=search_params,
headers=headers,
timeout=15
)
response.raise_for_status()
response.encoding = 'utf-8'
# Parse the search results page
soup = BeautifulSoup(response.text, "html.parser")
page_urls = extract_search_results(soup, website_name)
if not page_urls:
print(f"{page} 页没有找到更多结果")
break
article_urls.extend(page_urls)
print(f"{page} 页找到 {len(page_urls)} 篇文章")
# Avoid sending requests too quickly
time.sleep(1)
except Exception as e:
print(f"搜索第 {page} 页时出错: {e}")
continue
print(f"总共找到 {len(article_urls)} 篇文章")
return article_urls
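# Example call (hypothetical keyword and dates), returning a flat list of article URLs:
#   urls = search_articles_by_keyword("新华网", "数字经济", max_pages=3,
#                                     start_date="2025-09-01", end_date="2025-09-24")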
def extract_search_results(soup, website_name):
"""
从搜索结果页面提取文章链接
Args:
soup: BeautifulSoup对象
website_name: 网站名称
Returns:
list: 文章URL列表
"""
urls = []
# Extract links according to each site's search result page structure
if website_name == "新华网":
# 新华网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/news/" in href or "/article/" in href:
urls.append(href)
elif website_name == "人民日报":
# 人民日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/n1/" in href or "/article/" in href:
urls.append(href)
elif website_name == "央视网":
# 央视网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/news/" in href or "ARTI" in href:
urls.append(href)
elif website_name == "光明日报":
# 光明日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "经济日报":
# 经济日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "中国日报":
# 中国日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "工人日报":
# 工人日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/c/" in href or "/article/" in href:
urls.append(href)
elif website_name == "科技日报":
# 科技日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "人民政协网":
# 人民政协网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "中国纪检监察报":
# 中国纪检监察报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "中国新闻社":
# 中国新闻社 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "学习时报":
# 学习时报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "中国青年报":
# 中国青年报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "中国妇女报":
# 中国妇女报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "法治日报":
# 法治日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/content/" in href and "content_" in href:
urls.append(href)
elif website_name == "农民日报":
# 农民日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "学习强国":
# 学习强国 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "旗帜网":
# 旗帜网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/n1/" in href or "/article/" in href:
urls.append(href)
elif website_name == "中国网":
# 中国网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/opinion/" in href:
urls.append(href)
elif website_name == "中国政府网":
# 中国政府网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/zhengce/" in href or "/xinwen/" in href:
urls.append(href)
elif website_name == "求是网":
# 求是网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "解放军报":
# 解放军报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/zt/" in href or "/article/" in href:
urls.append(href)
# Deduplicate and return
return list(set(urls))
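# Note: the set-based deduplication above does not preserve result order, and some
# sites return relative hrefs, so callers may need to resolve them (e.g. with urljoin)
# before fetching.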
def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None, end_date=None, max_articles=100):
"""
根据关键词爬取多个网站的文章
Args:
keyword: 搜索关键词
website_names: 网站名称列表如果为None则爬取所有支持的网站
max_pages: 每个网站最大搜索页数
start_date: 开始日期 (YYYY-MM-DD)
end_date: 结束日期 (YYYY-MM-DD)
max_articles: 最大文章数量
Returns:
dict: 爬取结果统计
"""
if website_names is None:
website_names = list(WEBSITE_SEARCH_CONFIGS.keys())
results = {
"keyword": keyword,
"total_articles": 0,
"success_count": 0,
"failed_count": 0,
"website_results": {}
}
print(f"开始根据关键词 '{keyword}' 爬取文章...")
print(f"目标网站: {', '.join(website_names)}")
for website_name in website_names:
print(f"\n开始爬取 {website_name}...")
try:
# Get or create the Website object
from core.models import Website
website, created = Website.objects.get_or_create(
name=website_name,
defaults={
'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
'enabled': True
}
)
# Search for article URLs
article_urls = search_articles_by_keyword(
website_name, keyword, max_pages, start_date, end_date
)
if not article_urls:
print(f"{website_name} 没有找到相关文章")
results["website_results"][website_name] = {
"found_urls": 0,
"processed": 0,
"success": 0,
"failed": 0
}
continue
# Cap the number of articles
if len(article_urls) > max_articles:
article_urls = article_urls[:max_articles]
print(f"{website_name} 找到 {len(article_urls)} 篇文章,开始处理...")
website_success = 0
website_failed = 0
for i, url in enumerate(article_urls, 1):
try:
print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
process_article(url, website)
website_success += 1
results["success_count"] += 1
# Avoid sending requests too quickly
time.sleep(0.5)
except Exception as e:
print(f"处理文章失败: {url}, 错误: {e}")
website_failed += 1
results["failed_count"] += 1
results["website_results"][website_name] = {
"found_urls": len(article_urls),
"processed": len(article_urls),
"success": website_success,
"failed": website_failed
}
print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")
except Exception as e:
print(f"爬取 {website_name} 时出错: {e}")
results["website_results"][website_name] = {
"found_urls": 0,
"processed": 0,
"success": 0,
"failed": 1,
"error": str(e)
}
results["failed_count"] += 1
results["total_articles"] = results["success_count"] + results["failed_count"]
print(f"\n爬取完成!")
print(f"关键词: {keyword}")
print(f"总文章数: {results['total_articles']}")
print(f"成功: {results['success_count']}")
print(f"失败: {results['failed_count']}")
return results
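# Example usage (hypothetical arguments): crawl two configured sites for one keyword
# and inspect the per-site statistics.
#   stats = crawl_by_keyword("数字经济", website_names=["新华网", "央视网"],
#                            max_pages=2, max_articles=20)
#   print(stats["website_results"])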
def crawl_historical_articles(website_names=None, start_date=None, end_date=None, max_articles_per_site=50):
"""
爬取历史文章
Args:
website_names: 网站名称列表
start_date: 开始日期 (YYYY-MM-DD)
end_date: 结束日期 (YYYY-MM-DD)
max_articles_per_site: 每个网站最大文章数
Returns:
dict: 爬取结果统计
"""
if not start_date:
start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
if not end_date:
end_date = datetime.now().strftime("%Y-%m-%d")
if website_names is None:
website_names = list(WEBSITE_SEARCH_CONFIGS.keys())
results = {
"start_date": start_date,
"end_date": end_date,
"total_articles": 0,
"success_count": 0,
"failed_count": 0,
"website_results": {}
}
print(f"开始爬取历史文章...")
print(f"日期范围: {start_date}{end_date}")
print(f"目标网站: {', '.join(website_names)}")
# 使用通用关键词搜索历史文章
common_keywords = ["新闻", "报道", "文章", "资讯", "动态"]
for website_name in website_names:
print(f"\n开始爬取 {website_name} 历史文章...")
try:
from core.models import Website
website, created = Website.objects.get_or_create(
name=website_name,
defaults={
'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
'enabled': True
}
)
website_success = 0
website_failed = 0
all_urls = set()
# Search with several keywords
for keyword in common_keywords:
try:
article_urls = search_articles_by_keyword(
website_name, keyword, max_pages=5,
start_date=start_date, end_date=end_date
)
all_urls.update(article_urls)
if len(all_urls) >= max_articles_per_site:
break
except Exception as e:
print(f"搜索关键词 '{keyword}' 时出错: {e}")
continue
# Cap the number of articles
article_urls = list(all_urls)[:max_articles_per_site]
if not article_urls:
print(f"{website_name} 没有找到历史文章")
results["website_results"][website_name] = {
"found_urls": 0,
"processed": 0,
"success": 0,
"failed": 0
}
continue
print(f"{website_name} 找到 {len(article_urls)} 篇历史文章,开始处理...")
for i, url in enumerate(article_urls, 1):
try:
print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
process_article(url, website)
website_success += 1
results["success_count"] += 1
time.sleep(0.5)
except Exception as e:
print(f"处理文章失败: {url}, 错误: {e}")
website_failed += 1
results["failed_count"] += 1
results["website_results"][website_name] = {
"found_urls": len(article_urls),
"processed": len(article_urls),
"success": website_success,
"failed": website_failed
}
print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")
except Exception as e:
print(f"爬取 {website_name} 历史文章时出错: {e}")
results["website_results"][website_name] = {
"found_urls": 0,
"processed": 0,
"success": 0,
"failed": 1,
"error": str(e)
}
results["failed_count"] += 1
results["total_articles"] = results["success_count"] + results["failed_count"]
print(f"\n历史文章爬取完成!")
print(f"日期范围: {start_date}{end_date}")
print(f"总文章数: {results['total_articles']}")
print(f"成功: {results['success_count']}")
print(f"失败: {results['failed_count']}")
return results
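# Example usage (hypothetical date range): backfill roughly one week of articles
# from every configured site.
#   crawl_historical_articles(start_date="2025-09-17", end_date="2025-09-24",
#                             max_articles_per_site=20)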