support keyword crawl

2025-09-26 10:39:36 +08:00
parent 499454ff27
commit a36d730384
12 changed files with 2370 additions and 505 deletions
--- a/core/management/commands/crawl_all_websites.py
+++ b/core/management/commands/crawl_all_websites.py
@@ -1,71 +1,64 @@
+import json
 from django.core.management.base import BaseCommand
 from core.utils import full_site_crawler, crawl_by_keyword, WEBSITE_SEARCH_CONFIGS
-from core.models import Website
-import json


 class Command(BaseCommand):
-    help = "一键爬取所有支持的网站"
+    help = '一键爬取所有网站'

    def add_arguments(self, parser):
        parser.add_argument(
            '--mode', '-m',
            type=str,
+            default='both',
            choices=['full', 'keyword', 'both'],
-            default='full',
-            help='爬取模式: full(全站爬取), keyword(关键词爬取), both(两种模式)'
+            help='爬取模式: full(全站爬取), keyword(关键词搜索), both(两者都执行)'
        )
        
        parser.add_argument(
            '--keyword', '-k',
            type=str,
-            help='关键词搜索模式下的搜索关键词'
+            help='关键词搜索的关键词'
        )
        
        parser.add_argument(
            '--websites', '-w',
            type=str,
            nargs='*',
-            help='指定要爬取的网站名称列表，如果不指定则爬取所有支持的网站'
+            help='指定要爬取的网站列表'
        )
        
        parser.add_argument(
            '--max-pages', '-p',
            type=int,
            default=500,
-            help='全站爬取最大页数 (默认: 500)'
+            help='全站爬取的最大页数'
        )
        
        parser.add_argument(
-            '--max-search-pages', '-sp',
+            '--max-search-pages', '-P',
            type=int,
            default=10,
-            help='关键词搜索最大页数 (默认: 10)'
+            help='关键词搜索的最大页数'
        )
        
        parser.add_argument(
            '--max-articles', '-a',
            type=int,
            default=100,
-            help='关键词搜索最大文章数量 (默认: 100)'
+            help='关键词搜索的最大文章数'
        )
        
        parser.add_argument(
            '--start-date', '-s',
            type=str,
-            help='开始日期 (格式: YYYY-MM-DD)'
+            help='开始日期 (YYYY-MM-DD)'
        )
        
        parser.add_argument(
            '--end-date', '-e',
            type=str,
-            help='结束日期 (格式: YYYY-MM-DD)'
-        )
-        
-        parser.add_argument(
-            '--list-websites', '-l',
-            action='store_true',
-            help='列出所有支持的网站'
+            help='结束日期 (YYYY-MM-DD)'
        )
        
        parser.add_argument(
@@ -75,9 +68,15 @@ class Command(BaseCommand):
        )
        
        parser.add_argument(
-            '--skip-existing',
+            '--skip-existing', '-S',
            action='store_true',
-            help='跳过已存在的网站配置'
+            help='跳过已存在的网站'
+        )
+        
+        parser.add_argument(
+            '--list-websites', '-l',
+            action='store_true',
+            help='列出所有支持的网站'
        )

    def handle(self, *args, **options):
@@ -162,6 +161,7 @@ class Command(BaseCommand):
                self.stdout.write(f"{'='*50}")
                
                # 获取或创建网站对象
+                from core.models import Website
                website, created = Website.objects.get_or_create(
                    name=website_name,
                    defaults={
@@ -263,4 +263,4 @@ class Command(BaseCommand):
            
        except Exception as e:
            self.stdout.write(self.style.ERROR(f"一键爬取过程中出现错误: {e}"))
-            raise
+            raise
--- a/core/management/commands/crawl_by_keyword.py
+++ b/core/management/commands/crawl_by_keyword.py
@@ -1,15 +1,16 @@
-from django.core.management.base import BaseCommand
-from core.utils import crawl_by_keyword, crawl_historical_articles, WEBSITE_SEARCH_CONFIGS
 import json
+from django.core.management.base import BaseCommand
+from core.utils import crawl_by_keyword, WEBSITE_SEARCH_CONFIGS


 class Command(BaseCommand):
-    help = "根据关键词爬取多个网站的文章"
+    help = '根据关键词爬取文章'

    def add_arguments(self, parser):
        parser.add_argument(
            '--keyword', '-k',
            type=str,
+            required=True,
            help='搜索关键词'
        )
        
@@ -17,39 +18,39 @@ class Command(BaseCommand):
            '--websites', '-w',
            type=str,
            nargs='*',
-            help='指定要爬取的网站名称列表，如果不指定则爬取所有支持的网站'
+            help='指定要爬取的网站列表'
        )
        
        parser.add_argument(
            '--max-pages', '-p',
            type=int,
            default=10,
-            help='每个网站最大搜索页数 (默认: 10)'
+            help='每个网站最大搜索页数'
        )
        
        parser.add_argument(
-            '--max-articles', '-a',
+            '--max-articles', '-m',
            type=int,
            default=100,
-            help='最大文章数量 (默认: 100)'
+            help='最大文章数量'
        )
        
        parser.add_argument(
            '--start-date', '-s',
            type=str,
-            help='开始日期 (格式: YYYY-MM-DD)'
+            help='开始日期 (YYYY-MM-DD)'
        )
        
        parser.add_argument(
            '--end-date', '-e',
            type=str,
-            help='结束日期 (格式: YYYY-MM-DD)'
+            help='结束日期 (YYYY-MM-DD)'
        )
        
        parser.add_argument(
-            '--historical',
+            '--historical', '-H',
            action='store_true',
-            help='爬取历史文章模式'
+            help='使用历史文章爬取模式'
        )
        
        parser.add_argument(
@@ -121,6 +122,7 @@ class Command(BaseCommand):
            if historical:
                # 历史文章爬取模式
                self.stdout.write(self.style.WARNING("使用历史文章爬取模式"))
+                from core.utils import crawl_historical_articles
                results = crawl_historical_articles(
                    website_names=websites,
                    start_date=start_date,
@@ -163,4 +165,4 @@ class Command(BaseCommand):
            
        except Exception as e:
            self.stdout.write(self.style.ERROR(f"爬取过程中出现错误: {e}"))
-            raise
+            raise