Support keyword

2025-09-24 03:38:32 +08:00
parent a4891b1c30
commit 8592833d74
16 changed files with 2888 additions and 2 deletions


@@ -0,0 +1,257 @@
from django.core.management.base import BaseCommand
from core.utils import full_site_crawler, crawl_by_keyword, WEBSITE_SEARCH_CONFIGS
from core.models import Website
import json
class Command(BaseCommand):
    help = "Crawl all supported websites in one go"

    def add_arguments(self, parser):
        parser.add_argument(
            '--mode', '-m',
            type=str,
            choices=['full', 'keyword', 'both'],
            default='full',
            help='Crawl mode: full (full-site crawl), keyword (keyword crawl), both (run both modes)'
        )
        parser.add_argument(
            '--keyword', '-k',
            type=str,
            help='Search keyword for keyword mode'
        )
        parser.add_argument(
            '--websites', '-w',
            type=str,
            nargs='*',
            help='Names of the websites to crawl; if omitted, all supported websites are crawled'
        )
        parser.add_argument(
            '--max-pages', '-p',
            type=int,
            default=500,
            help='Maximum number of pages for the full-site crawl (default: 500)'
        )
        parser.add_argument(
            '--max-search-pages', '-sp',
            type=int,
            default=10,
            help='Maximum number of search pages for the keyword crawl (default: 10)'
        )
        parser.add_argument(
            '--max-articles', '-a',
            type=int,
            default=100,
            help='Maximum number of articles for the keyword crawl (default: 100)'
        )
        parser.add_argument(
            '--start-date', '-s',
            type=str,
            help='Start date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--end-date', '-e',
            type=str,
            help='End date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--list-websites', '-l',
            action='store_true',
            help='List all supported websites'
        )
        parser.add_argument(
            '--output', '-o',
            type=str,
            help='Save the results to a JSON file'
        )
        parser.add_argument(
            '--skip-existing',
            action='store_true',
            help='Skip websites that already exist in the database'
        )
    def handle(self, *args, **options):
        # List the supported websites
        if options['list_websites']:
            self.stdout.write(self.style.SUCCESS("Supported websites:"))
            for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
                self.stdout.write(f"{i:2d}. {website}")
            return

        mode = options['mode']
        keyword = options['keyword']
        websites = options['websites']
        max_pages = options['max_pages']
        max_search_pages = options['max_search_pages']
        max_articles = options['max_articles']
        start_date = options['start_date']
        end_date = options['end_date']
        output_file = options['output']
        skip_existing = options['skip_existing']

        # Validate website names
        if websites:
            invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
            if invalid_websites:
                self.stdout.write(
                    self.style.ERROR(f"Unsupported websites: {', '.join(invalid_websites)}")
                )
                self.stdout.write("Use --list-websites to see the supported websites")
                return

        # Determine the list of websites to crawl
        target_websites = websites if websites else list(WEBSITE_SEARCH_CONFIGS.keys())

        # Keyword mode requires a keyword
        if mode in ['keyword', 'both'] and not keyword:
            self.stdout.write(
                self.style.ERROR("Keyword mode requires the --keyword argument")
            )
            return

        self.stdout.write("Starting one-click crawl task...")
        self.stdout.write(f"Crawl mode: {mode}")
        self.stdout.write(f"Target websites: {', '.join(target_websites)}")
        if keyword:
            self.stdout.write(f"Keyword: {keyword}")
        if start_date:
            self.stdout.write(f"Start date: {start_date}")
        if end_date:
            self.stdout.write(f"End date: {end_date}")

        all_results = {
            "mode": mode,
            "websites": target_websites,
            "keyword": keyword,
            "start_date": start_date,
            "end_date": end_date,
            "full_crawl_results": {},
            "keyword_crawl_results": {},
            "summary": {
                "total_websites": len(target_websites),
                "full_crawl_success": 0,
                "full_crawl_failed": 0,
                "keyword_crawl_success": 0,
                "keyword_crawl_failed": 0
            }
        }
        try:
            for website_name in target_websites:
                self.stdout.write(f"\n{'='*50}")
                self.stdout.write(f"Processing website: {website_name}")
                self.stdout.write(f"{'='*50}")

                # Get or create the Website object
                website, created = Website.objects.get_or_create(
                    name=website_name,
                    defaults={
                        'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                        'enabled': True
                    }
                )
                if not created and skip_existing:
                    self.stdout.write(f"Skipping existing website: {website_name}")
                    continue

                website_results = {
                    "full_crawl": None,
                    "keyword_crawl": None
                }

                # Full-site crawl
                if mode in ['full', 'both']:
                    self.stdout.write(f"\nStarting full-site crawl: {website_name}")
                    try:
                        full_site_crawler(
                            WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                            website,
                            max_pages=max_pages
                        )
                        self.stdout.write(self.style.SUCCESS(f"Full-site crawl finished: {website_name}"))
                        website_results["full_crawl"] = {"status": "success"}
                        all_results["summary"]["full_crawl_success"] += 1
                    except Exception as e:
                        self.stdout.write(self.style.ERROR(f"Full-site crawl failed: {website_name}, error: {e}"))
                        website_results["full_crawl"] = {"status": "failed", "error": str(e)}
                        all_results["summary"]["full_crawl_failed"] += 1

                # Keyword crawl
                if mode in ['keyword', 'both']:
                    self.stdout.write(f"\nStarting keyword crawl: {website_name}")
                    try:
                        keyword_results = crawl_by_keyword(
                            keyword=keyword,
                            website_names=[website_name],
                            max_pages=max_search_pages,
                            start_date=start_date,
                            end_date=end_date,
                            max_articles=max_articles
                        )
                        website_results["keyword_crawl"] = keyword_results
                        if keyword_results["success_count"] > 0:
                            all_results["summary"]["keyword_crawl_success"] += 1
                        else:
                            all_results["summary"]["keyword_crawl_failed"] += 1
                    except Exception as e:
                        self.stdout.write(self.style.ERROR(f"Keyword crawl failed: {website_name}, error: {e}"))
                        website_results["keyword_crawl"] = {"status": "failed", "error": str(e)}
                        all_results["summary"]["keyword_crawl_failed"] += 1

                all_results["full_crawl_results"][website_name] = website_results["full_crawl"]
                all_results["keyword_crawl_results"][website_name] = website_results["keyword_crawl"]

            # Print the final summary
            self.stdout.write(f"\n{'='*50}")
            self.stdout.write(self.style.SUCCESS("One-click crawl finished!"))
            self.stdout.write(f"{'='*50}")
            self.stdout.write(f"Total websites: {all_results['summary']['total_websites']}")
            if mode in ['full', 'both']:
                self.stdout.write(f"Full-site crawl - success: {all_results['summary']['full_crawl_success']}, "
                                  f"failed: {all_results['summary']['full_crawl_failed']}")
            if mode in ['keyword', 'both']:
                self.stdout.write(f"Keyword crawl - success: {all_results['summary']['keyword_crawl_success']}, "
                                  f"failed: {all_results['summary']['keyword_crawl_failed']}")

            # Print per-website details
            self.stdout.write("\nPer-website results:")
            for website_name in target_websites:
                self.stdout.write(f"\n{website_name}:")
                if mode in ['full', 'both']:
                    full_result = all_results["full_crawl_results"][website_name]
                    if full_result and full_result.get("status") == "success":
                        self.stdout.write(self.style.SUCCESS("  Full-site crawl: success"))
                    elif full_result:
                        self.stdout.write(self.style.ERROR(f"  Full-site crawl: failed - {full_result.get('error', 'unknown error')}"))
                if mode in ['keyword', 'both']:
                    keyword_result = all_results["keyword_crawl_results"][website_name]
                    if keyword_result and "success_count" in keyword_result:
                        self.stdout.write(f"  Keyword crawl: {keyword_result['success_count']} articles succeeded, "
                                          f"{keyword_result['failed_count']} failed")
                    elif keyword_result and keyword_result.get("status") == "failed":
                        self.stdout.write(self.style.ERROR(f"  Keyword crawl: failed - {keyword_result.get('error', 'unknown error')}"))

            # Save the results to a file
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(all_results, f, ensure_ascii=False, indent=2)
                self.stdout.write(f"\nResults saved to: {output_file}")
        except Exception as e:
            self.stdout.write(self.style.ERROR(f"Error during one-click crawl: {e}"))
            raise
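For reference, a minimal sketch of invoking this command programmatically via Django's call_command is shown below. The command name crawl_all_websites is an assumption (the file path is not shown in this diff) and the argument values are illustrative only.

# Hypothetical invocation sketch -- "crawl_all_websites" is an assumed command name;
# replace it with the actual management command module name.
from django.core.management import call_command

call_command(
    'crawl_all_websites',
    mode='both',
    keyword='AI',           # required when mode is 'keyword' or 'both'
    max_pages=50,           # cap the full-site crawl
    max_search_pages=5,     # cap the keyword search pagination
    output='results.json',  # write the summary dict to a JSON file
)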


@@ -0,0 +1,157 @@
from django.core.management.base import BaseCommand
from core.utils import crawl_by_keyword, crawl_historical_articles, WEBSITE_SEARCH_CONFIGS
import json
class Command(BaseCommand):
    help = "Crawl articles from multiple websites by keyword"

    def add_arguments(self, parser):
        parser.add_argument(
            '--keyword', '-k',
            type=str,
            help='Search keyword'
        )
        parser.add_argument(
            '--websites', '-w',
            type=str,
            nargs='*',
            help='Names of the websites to crawl; if omitted, all supported websites are crawled'
        )
        parser.add_argument(
            '--max-pages', '-p',
            type=int,
            default=10,
            help='Maximum number of search pages per website (default: 10)'
        )
        parser.add_argument(
            '--max-articles', '-a',
            type=int,
            default=100,
            help='Maximum number of articles (default: 100)'
        )
        parser.add_argument(
            '--start-date', '-s',
            type=str,
            help='Start date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--end-date', '-e',
            type=str,
            help='End date (format: YYYY-MM-DD)'
        )
        parser.add_argument(
            '--historical',
            action='store_true',
            help='Crawl historical articles instead of searching by keyword'
        )
        parser.add_argument(
            '--list-websites', '-l',
            action='store_true',
            help='List all supported websites'
        )
        parser.add_argument(
            '--output', '-o',
            type=str,
            help='Save the results to a JSON file'
        )
    def handle(self, *args, **options):
        # List the supported websites
        if options['list_websites']:
            self.stdout.write(self.style.SUCCESS("Supported websites:"))
            for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
                self.stdout.write(f"{i:2d}. {website}")
            return

        keyword = options['keyword']
        if not keyword:
            self.stdout.write(self.style.ERROR("The --keyword argument is required"))
            return

        websites = options['websites']
        max_pages = options['max_pages']
        max_articles = options['max_articles']
        start_date = options['start_date']
        end_date = options['end_date']
        historical = options['historical']
        output_file = options['output']

        # Validate website names
        if websites:
            invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
            if invalid_websites:
                self.stdout.write(
                    self.style.ERROR(f"Unsupported websites: {', '.join(invalid_websites)}")
                )
                self.stdout.write("Use --list-websites to see the supported websites")
                return

        self.stdout.write("Starting crawl task...")
        self.stdout.write(f"Keyword: {keyword}")
        if websites:
            self.stdout.write(f"Target websites: {', '.join(websites)}")
        else:
            self.stdout.write(f"Target websites: all {len(WEBSITE_SEARCH_CONFIGS)} supported websites")
        if start_date:
            self.stdout.write(f"Start date: {start_date}")
        if end_date:
            self.stdout.write(f"End date: {end_date}")
        self.stdout.write(f"Max pages: {max_pages}")
        self.stdout.write(f"Max articles: {max_articles}")
        try:
            if historical:
                # Historical-article crawl mode
                self.stdout.write(self.style.WARNING("Using historical-article crawl mode"))
                results = crawl_historical_articles(
                    website_names=websites,
                    start_date=start_date,
                    end_date=end_date,
                    max_articles_per_site=max_articles
                )
            else:
                # Keyword search mode
                results = crawl_by_keyword(
                    keyword=keyword,
                    website_names=websites,
                    max_pages=max_pages,
                    start_date=start_date,
                    end_date=end_date,
                    max_articles=max_articles
                )

            # Print the summary
            self.stdout.write(self.style.SUCCESS("\nCrawl finished!"))
            self.stdout.write(f"Total articles: {results['total_articles']}")
            self.stdout.write(f"Succeeded: {results['success_count']}")
            self.stdout.write(f"Failed: {results['failed_count']}")

            # Print per-website results
            self.stdout.write("\nPer-website results:")
            for website, result in results['website_results'].items():
                status = self.style.SUCCESS if result['success'] > 0 else self.style.WARNING
                self.stdout.write(
                    status(f"  {website}: found {result['found_urls']} URLs, "
                           f"{result['success']} succeeded, {result['failed']} failed")
                )
                if 'error' in result:
                    self.stdout.write(self.style.ERROR(f"    error: {result['error']}"))

            # Save the results to a file
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                self.stdout.write(f"\nResults saved to: {output_file}")
        except Exception as e:
            self.stdout.write(self.style.ERROR(f"Error during crawl: {e}"))
            raise
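A similar hedged sketch for this keyword command: the command name crawl_keyword and the website name 'example_site' below are placeholders, since neither the file path nor the configured site names appear in this diff.

# Hypothetical invocation sketch -- "crawl_keyword" is an assumed command name
# and 'example_site' is a placeholder entry from WEBSITE_SEARCH_CONFIGS.
from django.core.management import call_command

call_command(
    'crawl_keyword',
    keyword='climate policy',
    websites=['example_site'],   # omit to crawl every configured website
    max_pages=5,
    start_date='2024-01-01',
    end_date='2024-12-31',
    output='keyword_results.json',
)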