diff --git a/core/admin_extended.py b/core/admin_extended.py
index c496a07..feb5f4a 100644
--- a/core/admin_extended.py
+++ b/core/admin_extended.py
@@ -16,9 +16,10 @@ from django.utils import timezone
from django.db.models import Count, Q
from django.core.cache import cache
-from .models import Website, Article
+from .models import Website, Article, CrawlTask
from .tasks import crawl_website, crawl_all_websites, cleanup_old_articles
from .distributed_crawler import distributed_crawler
+from .task_executor import task_executor
logger = logging.getLogger(__name__)
@@ -411,6 +412,419 @@ class ArticleAdmin(admin.ModelAdmin):
actions_column.short_description = '操作'
+class CrawlTaskStatusFilter(SimpleListFilter):
+ """爬取任务状态过滤器"""
+ title = '任务状态'
+ parameter_name = 'status'
+
+ def lookups(self, request, model_admin):
+ return (
+ ('pending', '等待中'),
+ ('running', '运行中'),
+ ('completed', '已完成'),
+ ('failed', '失败'),
+ ('cancelled', '已取消'),
+ )
+
+ def queryset(self, request, queryset):
+ if self.value():
+ return queryset.filter(status=self.value())
+ return queryset
+
+
+class CrawlTaskTypeFilter(SimpleListFilter):
+ """爬取任务类型过滤器"""
+ title = '任务类型'
+ parameter_name = 'task_type'
+
+ def lookups(self, request, model_admin):
+ return (
+ ('keyword', '关键词搜索'),
+ ('historical', '历史文章'),
+ ('full_site', '全站爬取'),
+ )
+
+ def queryset(self, request, queryset):
+ if self.value():
+ return queryset.filter(task_type=self.value())
+ return queryset
+
+
+class CrawlTaskAdmin(admin.ModelAdmin):
+ """爬取任务管理"""
+ list_display = [
+ 'name', 'task_type', 'keyword', 'websites_display', 'status',
+ 'progress_display', 'created_at', 'duration_display', 'actions_column'
+ ]
+ list_filter = [CrawlTaskStatusFilter, CrawlTaskTypeFilter, 'created_at']
+ search_fields = ['name', 'keyword', 'created_by']
+ readonly_fields = [
+ 'status', 'progress', 'current_website', 'current_action',
+ 'total_articles', 'success_count', 'failed_count',
+ 'created_at', 'started_at', 'completed_at', 'error_message',
+ 'result_details', 'duration_display', 'progress_display'
+ ]
+ actions = ['start_tasks', 'cancel_tasks', 'delete_completed_tasks']
+
+ class Media:
+ js = ('admin/js/crawl_task_actions.js',)
+
+ fieldsets = (
+ ('基本信息', {
+ 'fields': ('name', 'task_type', 'keyword')
+ }),
+ ('爬取配置', {
+ 'fields': ('websites', 'start_date', 'end_date', 'max_pages', 'max_articles')
+ }),
+ ('任务状态', {
+ 'fields': ('status', 'progress_display', 'current_website', 'current_action'),
+ 'classes': ('collapse',)
+ }),
+ ('统计信息', {
+ 'fields': ('total_articles', 'success_count', 'failed_count'),
+ 'classes': ('collapse',)
+ }),
+ ('时间信息', {
+ 'fields': ('created_at', 'started_at', 'completed_at', 'duration_display'),
+ 'classes': ('collapse',)
+ }),
+ ('错误信息', {
+ 'fields': ('error_message',),
+ 'classes': ('collapse',)
+ }),
+ ('结果详情', {
+ 'fields': ('result_details',),
+ 'classes': ('collapse',)
+ }),
+ )
+
+ def websites_display(self, obj):
+ """网站列表显示"""
+ return obj.get_websites_display()
+ websites_display.short_description = '目标网站'
+
+ def progress_display(self, obj):
+ """进度显示"""
+ if obj.status == 'running':
+            return format_html(
+                '<div style="width: 100px; background: #f0f0f0; border-radius: 3px;">'
+                '<div style="width: {}%; background: #79aec8; color: #fff; text-align: center; '
+                'border-radius: 3px; font-size: 11px;">{}%</div></div>',
+                obj.progress, obj.progress
+            )
+        elif obj.status == 'completed':
+            return format_html('<span style="color: green;">✓ 完成</span>')
+        elif obj.status == 'failed':
+            return format_html('<span style="color: red;">✗ 失败</span>')
+        elif obj.status == 'cancelled':
+            return format_html('<span style="color: orange;">⊘ 已取消</span>')
+        else:
+            return format_html('<span style="color: gray;">⏳ 等待</span>')
+ progress_display.short_description = '进度'
+
+ def duration_display(self, obj):
+ """执行时长显示"""
+ duration = obj.get_duration()
+ if duration:
+ total_seconds = int(duration.total_seconds())
+ hours = total_seconds // 3600
+ minutes = (total_seconds % 3600) // 60
+ seconds = total_seconds % 60
+ if hours > 0:
+ return f"{hours}小时{minutes}分钟"
+ elif minutes > 0:
+ return f"{minutes}分钟{seconds}秒"
+ else:
+ return f"{seconds}秒"
+ return "-"
+ duration_display.short_description = '执行时长'
+
+ def actions_column(self, obj):
+ """操作列"""
+ actions = []
+
+        if obj.status == 'pending':
+            actions.append(f'<a class="button" href="javascript:void(0)" onclick="startTask({obj.id})">开始</a>')
+
+        if obj.can_cancel():
+            actions.append(f'<a class="button" href="javascript:void(0)" onclick="cancelTask({obj.id})">取消</a>')
+
+        if obj.status == 'completed':
+            actions.append(f'<a class="button" href="javascript:void(0)" onclick="viewResults({obj.id})">查看结果</a>')
+
+ return format_html(' '.join(actions))
+ actions_column.short_description = '操作'
+
+ def start_tasks(self, request, queryset):
+ """启动选中的任务"""
+ started_count = 0
+ for task in queryset.filter(status='pending'):
+ try:
+ success, message = task_executor.start_task(task.id)
+ if success:
+ started_count += 1
+ else:
+ self.message_user(request, f'启动任务 {task.name} 失败: {message}', messages.ERROR)
+ except Exception as e:
+ self.message_user(request, f'启动任务 {task.name} 失败: {e}', messages.ERROR)
+
+ if started_count > 0:
+ self.message_user(request, f'成功启动 {started_count} 个任务', messages.SUCCESS)
+ start_tasks.short_description = '启动选中的任务'
+
+ def cancel_tasks(self, request, queryset):
+ """取消选中的任务"""
+ cancelled_count = 0
+ for task in queryset.filter(status__in=['pending', 'running']):
+ try:
+ success, message = task_executor.cancel_task(task.id)
+ if success:
+ cancelled_count += 1
+ else:
+ self.message_user(request, f'取消任务 {task.name} 失败: {message}', messages.ERROR)
+ except Exception as e:
+ self.message_user(request, f'取消任务 {task.name} 失败: {e}', messages.ERROR)
+
+ if cancelled_count > 0:
+ self.message_user(request, f'成功取消 {cancelled_count} 个任务', messages.SUCCESS)
+ cancel_tasks.short_description = '取消选中的任务'
+
+ def delete_completed_tasks(self, request, queryset):
+ """删除已完成的任务"""
+ completed_tasks = queryset.filter(status__in=['completed', 'failed', 'cancelled'])
+ count = completed_tasks.count()
+ completed_tasks.delete()
+
+ if count > 0:
+ self.message_user(request, f'成功删除 {count} 个已完成的任务', messages.SUCCESS)
+ delete_completed_tasks.short_description = '删除已完成的任务'
+
+ def get_urls(self):
+ """添加自定义URL"""
+ urls = super().get_urls()
+ custom_urls = [
+ path(
+ 'create-keyword-task/',
+ self.admin_site.admin_view(self.create_keyword_task_view),
+ name='create_keyword_task',
+ ),
+ path(
+ 'create-historical-task/',
+ self.admin_site.admin_view(self.create_historical_task_view),
+ name='create_historical_task',
+ ),
+ path(
+ 'create-full-site-task/',
+ self.admin_site.admin_view(self.create_full_site_task_view),
+ name='create_full_site_task',
+ ),
+ path(
+                '<int:task_id>/start/',
+ self.admin_site.admin_view(self.start_task_view),
+ name='start_task',
+ ),
+ path(
+                '<int:task_id>/cancel/',
+ self.admin_site.admin_view(self.cancel_task_view),
+ name='cancel_task',
+ ),
+ path(
+                '<int:task_id>/results/',
+ self.admin_site.admin_view(self.view_results_view),
+ name='view_results',
+ ),
+ ]
+ return custom_urls + urls
+
+ def create_keyword_task_view(self, request):
+ """创建关键词搜索任务视图"""
+ if request.method == 'POST':
+ try:
+ from .utils import WEBSITE_SEARCH_CONFIGS
+
+ name = request.POST.get('name', '')
+ keyword = request.POST.get('keyword', '')
+ websites = request.POST.getlist('websites')
+ start_date = request.POST.get('start_date')
+ end_date = request.POST.get('end_date')
+ max_pages = int(request.POST.get('max_pages', 10))
+ max_articles = int(request.POST.get('max_articles', 100))
+
+ if not name or not keyword:
+ self.message_user(request, '任务名称和关键词不能为空', messages.ERROR)
+ return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
+
+ # 创建任务
+ task = CrawlTask.objects.create(
+ name=name,
+ task_type='keyword',
+ keyword=keyword,
+ start_date=start_date if start_date else None,
+ end_date=end_date if end_date else None,
+ max_pages=max_pages,
+ max_articles=max_articles,
+ created_by=request.user.username if request.user.is_authenticated else 'admin'
+ )
+
+ # 添加选择的网站
+ if websites:
+ website_objects = Website.objects.filter(name__in=websites)
+ task.websites.set(website_objects)
+
+ self.message_user(request, f'关键词搜索任务 "{name}" 创建成功', messages.SUCCESS)
+ return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
+
+ except Exception as e:
+ self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
+
+ # GET请求,显示创建表单
+ context = {
+ 'websites': Website.objects.filter(enabled=True),
+ 'title': '创建关键词搜索任务'
+ }
+ return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_keyword_task.html', context)
+
+ def create_historical_task_view(self, request):
+ """创建历史文章任务视图"""
+ if request.method == 'POST':
+ try:
+ from .utils import WEBSITE_SEARCH_CONFIGS
+
+ name = request.POST.get('name', '')
+ websites = request.POST.getlist('websites')
+ start_date = request.POST.get('start_date')
+ end_date = request.POST.get('end_date')
+ max_articles = int(request.POST.get('max_articles', 50))
+
+ if not name:
+ self.message_user(request, '任务名称不能为空', messages.ERROR)
+ return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
+
+ # 创建任务
+ task = CrawlTask.objects.create(
+ name=name,
+ task_type='historical',
+ keyword='历史文章',
+ start_date=start_date if start_date else None,
+ end_date=end_date if end_date else None,
+ max_articles=max_articles,
+ created_by=request.user.username if request.user.is_authenticated else 'admin'
+ )
+
+ # 添加选择的网站
+ if websites:
+ website_objects = Website.objects.filter(name__in=websites)
+ task.websites.set(website_objects)
+
+ self.message_user(request, f'历史文章任务 "{name}" 创建成功', messages.SUCCESS)
+ return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
+
+ except Exception as e:
+ self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
+
+ # GET请求,显示创建表单
+ context = {
+ 'websites': Website.objects.filter(enabled=True),
+ 'title': '创建历史文章任务'
+ }
+ return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_historical_task.html', context)
+
+ def create_full_site_task_view(self, request):
+ """创建全站爬取任务视图"""
+ if request.method == 'POST':
+ try:
+ from .utils import WEBSITE_SEARCH_CONFIGS
+
+ name = request.POST.get('name', '')
+ websites = request.POST.getlist('websites')
+ max_pages = int(request.POST.get('max_pages', 500))
+
+ if not name:
+ self.message_user(request, '任务名称不能为空', messages.ERROR)
+ return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
+
+ # 创建任务
+ task = CrawlTask.objects.create(
+ name=name,
+ task_type='full_site',
+ keyword='全站爬取',
+ max_pages=max_pages,
+ created_by=request.user.username if request.user.is_authenticated else 'admin'
+ )
+
+ # 添加选择的网站
+ if websites:
+ website_objects = Website.objects.filter(name__in=websites)
+ task.websites.set(website_objects)
+
+ self.message_user(request, f'全站爬取任务 "{name}" 创建成功', messages.SUCCESS)
+ return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
+
+ except Exception as e:
+ self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
+
+ # GET请求,显示创建表单
+ context = {
+ 'websites': Website.objects.filter(enabled=True),
+ 'title': '创建全站爬取任务'
+ }
+ return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_full_site_task.html', context)
+
+ def start_task_view(self, request, task_id):
+ """启动任务视图"""
+ try:
+ success, message = task_executor.start_task(task_id)
+ if success:
+ self.message_user(request, f'任务已启动: {message}', messages.SUCCESS)
+ else:
+ self.message_user(request, f'启动任务失败: {message}', messages.ERROR)
+ except Exception as e:
+ self.message_user(request, f'启动任务失败: {e}', messages.ERROR)
+
+ return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
+
+ def cancel_task_view(self, request, task_id):
+ """取消任务视图"""
+ try:
+ success, message = task_executor.cancel_task(task_id)
+ if success:
+ self.message_user(request, f'任务已取消: {message}', messages.SUCCESS)
+ else:
+ self.message_user(request, f'取消任务失败: {message}', messages.ERROR)
+ except Exception as e:
+ self.message_user(request, f'取消任务失败: {e}', messages.ERROR)
+
+ return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
+
+ def view_results_view(self, request, task_id):
+ """查看结果视图"""
+ try:
+ task = CrawlTask.objects.get(id=task_id)
+ context = {
+ 'task': task,
+ 'title': f'任务结果 - {task.name}'
+ }
+ return admin.site.admin_view(self.render_create_task_template)(request, 'admin/task_results.html', context)
+ except CrawlTask.DoesNotExist:
+ self.message_user(request, '任务不存在', messages.ERROR)
+ return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
+
+ def render_create_task_template(self, request, template_name, context):
+ """渲染创建任务模板"""
+ from django.template.loader import render_to_string
+ from django.http import HttpResponse
+
+ context.update({
+ 'site_header': admin.site.site_header,
+ 'site_title': admin.site.site_title,
+ 'has_permission': True,
+ 'user': request.user,
+ })
+
+ html = render_to_string(template_name, context)
+ return HttpResponse(html)
+
+
#class CrawlerStatusAdmin(admin.ModelAdmin):
# """爬虫状态管理"""
# change_list_template = 'admin/crawler_status.html'
@@ -448,6 +862,7 @@ class ArticleAdmin(admin.ModelAdmin):
# 注册管理类
admin.site.register(Website, WebsiteAdmin)
admin.site.register(Article, ArticleAdmin)
+admin.site.register(CrawlTask, CrawlTaskAdmin)
# 隐藏Celery Results管理功能
diff --git a/core/management/commands/crawl_all_websites.py b/core/management/commands/crawl_all_websites.py
new file mode 100644
index 0000000..25e2b0f
--- /dev/null
+++ b/core/management/commands/crawl_all_websites.py
@@ -0,0 +1,257 @@
+from django.core.management.base import BaseCommand
+from core.utils import full_site_crawler, crawl_by_keyword, WEBSITE_SEARCH_CONFIGS
+from core.models import Website
+import json
+
+
+class Command(BaseCommand):
+ help = "一键爬取所有支持的网站"
+
+ def add_arguments(self, parser):
+ parser.add_argument(
+ '--mode', '-m',
+ type=str,
+ choices=['full', 'keyword', 'both'],
+ default='full',
+ help='爬取模式: full(全站爬取), keyword(关键词爬取), both(两种模式)'
+ )
+
+ parser.add_argument(
+ '--keyword', '-k',
+ type=str,
+ help='关键词搜索模式下的搜索关键词'
+ )
+
+ parser.add_argument(
+ '--websites', '-w',
+ type=str,
+ nargs='*',
+ help='指定要爬取的网站名称列表,如果不指定则爬取所有支持的网站'
+ )
+
+ parser.add_argument(
+ '--max-pages', '-p',
+ type=int,
+ default=500,
+ help='全站爬取最大页数 (默认: 500)'
+ )
+
+ parser.add_argument(
+ '--max-search-pages', '-sp',
+ type=int,
+ default=10,
+ help='关键词搜索最大页数 (默认: 10)'
+ )
+
+ parser.add_argument(
+ '--max-articles', '-a',
+ type=int,
+ default=100,
+ help='关键词搜索最大文章数量 (默认: 100)'
+ )
+
+ parser.add_argument(
+ '--start-date', '-s',
+ type=str,
+ help='开始日期 (格式: YYYY-MM-DD)'
+ )
+
+ parser.add_argument(
+ '--end-date', '-e',
+ type=str,
+ help='结束日期 (格式: YYYY-MM-DD)'
+ )
+
+ parser.add_argument(
+ '--list-websites', '-l',
+ action='store_true',
+ help='列出所有支持的网站'
+ )
+
+ parser.add_argument(
+ '--output', '-o',
+ type=str,
+ help='将结果保存到JSON文件'
+ )
+
+ parser.add_argument(
+ '--skip-existing',
+ action='store_true',
+ help='跳过已存在的网站配置'
+ )
+
+ def handle(self, *args, **options):
+ # 列出支持的网站
+ if options['list_websites']:
+ self.stdout.write(self.style.SUCCESS("支持的网站列表:"))
+ for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
+ self.stdout.write(f"{i:2d}. {website}")
+ return
+
+ mode = options['mode']
+ keyword = options['keyword']
+ websites = options['websites']
+ max_pages = options['max_pages']
+ max_search_pages = options['max_search_pages']
+ max_articles = options['max_articles']
+ start_date = options['start_date']
+ end_date = options['end_date']
+ output_file = options['output']
+ skip_existing = options['skip_existing']
+
+ # 验证网站名称
+ if websites:
+ invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
+ if invalid_websites:
+ self.stdout.write(
+ self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}")
+ )
+ self.stdout.write("使用 --list-websites 查看支持的网站列表")
+ return
+
+ # 确定要爬取的网站列表
+ target_websites = websites if websites else list(WEBSITE_SEARCH_CONFIGS.keys())
+
+ # 验证关键词模式
+ if mode in ['keyword', 'both'] and not keyword:
+ self.stdout.write(
+ self.style.ERROR("关键词模式需要指定 --keyword 参数")
+ )
+ return
+
+ self.stdout.write(f"开始一键爬取任务...")
+ self.stdout.write(f"爬取模式: {mode}")
+ self.stdout.write(f"目标网站: {', '.join(target_websites)}")
+ if keyword:
+ self.stdout.write(f"关键词: {keyword}")
+ if start_date:
+ self.stdout.write(f"开始日期: {start_date}")
+ if end_date:
+ self.stdout.write(f"结束日期: {end_date}")
+
+ all_results = {
+ "mode": mode,
+ "websites": target_websites,
+ "keyword": keyword,
+ "start_date": start_date,
+ "end_date": end_date,
+ "full_crawl_results": {},
+ "keyword_crawl_results": {},
+ "summary": {
+ "total_websites": len(target_websites),
+ "full_crawl_success": 0,
+ "full_crawl_failed": 0,
+ "keyword_crawl_success": 0,
+ "keyword_crawl_failed": 0
+ }
+ }
+
+ try:
+ for website_name in target_websites:
+ self.stdout.write(f"\n{'='*50}")
+ self.stdout.write(f"开始处理网站: {website_name}")
+ self.stdout.write(f"{'='*50}")
+
+ # 获取或创建网站对象
+ website, created = Website.objects.get_or_create(
+ name=website_name,
+ defaults={
+ 'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
+ 'enabled': True
+ }
+ )
+
+ if not created and skip_existing:
+ self.stdout.write(f"跳过已存在的网站: {website_name}")
+ continue
+
+ website_results = {
+ "full_crawl": None,
+ "keyword_crawl": None
+ }
+
+ # 全站爬取
+ if mode in ['full', 'both']:
+ self.stdout.write(f"\n开始全站爬取: {website_name}")
+ try:
+ full_site_crawler(
+ WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
+ website,
+ max_pages=max_pages
+ )
+ self.stdout.write(self.style.SUCCESS(f"全站爬取完成: {website_name}"))
+ website_results["full_crawl"] = {"status": "success"}
+ all_results["summary"]["full_crawl_success"] += 1
+ except Exception as e:
+ self.stdout.write(self.style.ERROR(f"全站爬取失败: {website_name}, 错误: {e}"))
+ website_results["full_crawl"] = {"status": "failed", "error": str(e)}
+ all_results["summary"]["full_crawl_failed"] += 1
+
+ # 关键词爬取
+ if mode in ['keyword', 'both']:
+ self.stdout.write(f"\n开始关键词爬取: {website_name}")
+ try:
+ keyword_results = crawl_by_keyword(
+ keyword=keyword,
+ website_names=[website_name],
+ max_pages=max_search_pages,
+ start_date=start_date,
+ end_date=end_date,
+ max_articles=max_articles
+ )
+ website_results["keyword_crawl"] = keyword_results
+ if keyword_results["success_count"] > 0:
+ all_results["summary"]["keyword_crawl_success"] += 1
+ else:
+ all_results["summary"]["keyword_crawl_failed"] += 1
+ except Exception as e:
+ self.stdout.write(self.style.ERROR(f"关键词爬取失败: {website_name}, 错误: {e}"))
+ website_results["keyword_crawl"] = {"status": "failed", "error": str(e)}
+ all_results["summary"]["keyword_crawl_failed"] += 1
+
+ all_results["full_crawl_results"][website_name] = website_results["full_crawl"]
+ all_results["keyword_crawl_results"][website_name] = website_results["keyword_crawl"]
+
+ # 显示最终结果摘要
+ self.stdout.write(f"\n{'='*50}")
+ self.stdout.write(self.style.SUCCESS("一键爬取完成!"))
+ self.stdout.write(f"{'='*50}")
+ self.stdout.write(f"总网站数: {all_results['summary']['total_websites']}")
+
+ if mode in ['full', 'both']:
+ self.stdout.write(f"全站爬取 - 成功: {all_results['summary']['full_crawl_success']}, "
+ f"失败: {all_results['summary']['full_crawl_failed']}")
+
+ if mode in ['keyword', 'both']:
+ self.stdout.write(f"关键词爬取 - 成功: {all_results['summary']['keyword_crawl_success']}, "
+ f"失败: {all_results['summary']['keyword_crawl_failed']}")
+
+ # 显示各网站详细结果
+ self.stdout.write("\n各网站详细结果:")
+ for website_name in target_websites:
+ self.stdout.write(f"\n{website_name}:")
+
+ if mode in ['full', 'both']:
+                    full_result = all_results["full_crawl_results"].get(website_name)
+ if full_result and full_result.get("status") == "success":
+ self.stdout.write(self.style.SUCCESS(" 全站爬取: 成功"))
+ elif full_result:
+ self.stdout.write(self.style.ERROR(f" 全站爬取: 失败 - {full_result.get('error', '未知错误')}"))
+
+ if mode in ['keyword', 'both']:
+                    keyword_result = all_results["keyword_crawl_results"].get(website_name)
+ if keyword_result and "success_count" in keyword_result:
+ self.stdout.write(f" 关键词爬取: 成功 {keyword_result['success_count']} 篇, "
+ f"失败 {keyword_result['failed_count']} 篇")
+ elif keyword_result and keyword_result.get("status") == "failed":
+ self.stdout.write(self.style.ERROR(f" 关键词爬取: 失败 - {keyword_result.get('error', '未知错误')}"))
+
+ # 保存结果到文件
+ if output_file:
+ with open(output_file, 'w', encoding='utf-8') as f:
+ json.dump(all_results, f, ensure_ascii=False, indent=2)
+ self.stdout.write(f"\n结果已保存到: {output_file}")
+
+ except Exception as e:
+ self.stdout.write(self.style.ERROR(f"一键爬取过程中出现错误: {e}"))
+ raise
diff --git a/core/management/commands/crawl_by_keyword.py b/core/management/commands/crawl_by_keyword.py
new file mode 100644
index 0000000..9624dfe
--- /dev/null
+++ b/core/management/commands/crawl_by_keyword.py
@@ -0,0 +1,157 @@
+from django.core.management.base import BaseCommand
+from core.utils import crawl_by_keyword, crawl_historical_articles, WEBSITE_SEARCH_CONFIGS
+import json
+
+
+class Command(BaseCommand):
+ help = "根据关键词爬取多个网站的文章"
+
+ def add_arguments(self, parser):
+ parser.add_argument(
+ '--keyword', '-k',
+ type=str,
+ help='搜索关键词'
+ )
+
+ parser.add_argument(
+ '--websites', '-w',
+ type=str,
+ nargs='*',
+ help='指定要爬取的网站名称列表,如果不指定则爬取所有支持的网站'
+ )
+
+ parser.add_argument(
+ '--max-pages', '-p',
+ type=int,
+ default=10,
+ help='每个网站最大搜索页数 (默认: 10)'
+ )
+
+ parser.add_argument(
+ '--max-articles', '-a',
+ type=int,
+ default=100,
+ help='最大文章数量 (默认: 100)'
+ )
+
+ parser.add_argument(
+ '--start-date', '-s',
+ type=str,
+ help='开始日期 (格式: YYYY-MM-DD)'
+ )
+
+ parser.add_argument(
+ '--end-date', '-e',
+ type=str,
+ help='结束日期 (格式: YYYY-MM-DD)'
+ )
+
+ parser.add_argument(
+ '--historical',
+ action='store_true',
+ help='爬取历史文章模式'
+ )
+
+ parser.add_argument(
+ '--list-websites', '-l',
+ action='store_true',
+ help='列出所有支持的网站'
+ )
+
+ parser.add_argument(
+ '--output', '-o',
+ type=str,
+ help='将结果保存到JSON文件'
+ )
+
+ def handle(self, *args, **options):
+ # 列出支持的网站
+ if options['list_websites']:
+ self.stdout.write(self.style.SUCCESS("支持的网站列表:"))
+ for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
+ self.stdout.write(f"{i:2d}. {website}")
+ return
+
+ keyword = options['keyword']
+ if not keyword:
+ self.stdout.write(self.style.ERROR("必须指定 --keyword 参数"))
+ return
+ websites = options['websites']
+ max_pages = options['max_pages']
+ max_articles = options['max_articles']
+ start_date = options['start_date']
+ end_date = options['end_date']
+ historical = options['historical']
+ output_file = options['output']
+
+ # 验证网站名称
+ if websites:
+ invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
+ if invalid_websites:
+ self.stdout.write(
+ self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}")
+ )
+ self.stdout.write("使用 --list-websites 查看支持的网站列表")
+ return
+
+ self.stdout.write(f"开始爬取任务...")
+ self.stdout.write(f"关键词: {keyword}")
+ if websites:
+ self.stdout.write(f"目标网站: {', '.join(websites)}")
+ else:
+ self.stdout.write(f"目标网站: 所有支持的网站 ({len(WEBSITE_SEARCH_CONFIGS)}个)")
+
+ if start_date:
+ self.stdout.write(f"开始日期: {start_date}")
+ if end_date:
+ self.stdout.write(f"结束日期: {end_date}")
+ self.stdout.write(f"最大页数: {max_pages}")
+ self.stdout.write(f"最大文章数: {max_articles}")
+
+ try:
+ if historical:
+ # 历史文章爬取模式
+ self.stdout.write(self.style.WARNING("使用历史文章爬取模式"))
+ results = crawl_historical_articles(
+ website_names=websites,
+ start_date=start_date,
+ end_date=end_date,
+ max_articles_per_site=max_articles
+ )
+ else:
+ # 关键词搜索模式
+ results = crawl_by_keyword(
+ keyword=keyword,
+ website_names=websites,
+ max_pages=max_pages,
+ start_date=start_date,
+ end_date=end_date,
+ max_articles=max_articles
+ )
+
+ # 显示结果摘要
+ self.stdout.write(self.style.SUCCESS("\n爬取完成!"))
+ self.stdout.write(f"总文章数: {results['total_articles']}")
+ self.stdout.write(f"成功: {results['success_count']}")
+ self.stdout.write(f"失败: {results['failed_count']}")
+
+ # 显示各网站详细结果
+ self.stdout.write("\n各网站结果:")
+ for website, result in results['website_results'].items():
+ status = self.style.SUCCESS if result['success'] > 0 else self.style.WARNING
+ self.stdout.write(
+ status(f" {website}: 找到 {result['found_urls']} 篇, "
+ f"成功 {result['success']}, 失败 {result['failed']}")
+ )
+ if 'error' in result:
+ self.stdout.write(self.style.ERROR(f" 错误: {result['error']}"))
+
+ # 保存结果到文件
+ if output_file:
+ with open(output_file, 'w', encoding='utf-8') as f:
+ json.dump(results, f, ensure_ascii=False, indent=2)
+ self.stdout.write(f"\n结果已保存到: {output_file}")
+
+ except Exception as e:
+ self.stdout.write(self.style.ERROR(f"爬取过程中出现错误: {e}"))
+ raise
diff --git a/core/migrations/0002_crawltask.py b/core/migrations/0002_crawltask.py
new file mode 100644
index 0000000..6a62f0e
--- /dev/null
+++ b/core/migrations/0002_crawltask.py
@@ -0,0 +1,45 @@
+# Generated by Django 5.1 on 2025-09-23 19:28
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0001_initial'),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name='CrawlTask',
+ fields=[
+ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('name', models.CharField(max_length=200, verbose_name='任务名称')),
+ ('task_type', models.CharField(choices=[('keyword', '关键词搜索'), ('historical', '历史文章'), ('full_site', '全站爬取')], default='keyword', max_length=20, verbose_name='任务类型')),
+ ('keyword', models.CharField(blank=True, max_length=200, null=True, verbose_name='搜索关键词')),
+ ('websites', models.JSONField(default=list, verbose_name='目标网站')),
+ ('start_date', models.DateField(blank=True, null=True, verbose_name='开始日期')),
+ ('end_date', models.DateField(blank=True, null=True, verbose_name='结束日期')),
+ ('max_pages', models.IntegerField(default=10, verbose_name='最大页数')),
+ ('max_articles', models.IntegerField(default=100, verbose_name='最大文章数')),
+ ('status', models.CharField(choices=[('pending', '等待中'), ('running', '运行中'), ('completed', '已完成'), ('failed', '失败'), ('cancelled', '已取消')], default='pending', max_length=20, verbose_name='状态')),
+ ('progress', models.IntegerField(default=0, verbose_name='进度百分比')),
+ ('current_website', models.CharField(blank=True, max_length=100, null=True, verbose_name='当前网站')),
+ ('current_action', models.CharField(blank=True, max_length=200, null=True, verbose_name='当前操作')),
+ ('total_articles', models.IntegerField(default=0, verbose_name='总文章数')),
+ ('success_count', models.IntegerField(default=0, verbose_name='成功数')),
+ ('failed_count', models.IntegerField(default=0, verbose_name='失败数')),
+ ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
+ ('started_at', models.DateTimeField(blank=True, null=True, verbose_name='开始时间')),
+ ('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='完成时间')),
+ ('error_message', models.TextField(blank=True, null=True, verbose_name='错误信息')),
+ ('result_details', models.JSONField(blank=True, default=dict, verbose_name='结果详情')),
+ ('created_by', models.CharField(blank=True, max_length=100, null=True, verbose_name='创建者')),
+ ],
+ options={
+ 'verbose_name': '爬取任务',
+ 'verbose_name_plural': '爬取任务',
+ 'ordering': ['-created_at'],
+ },
+ ),
+ ]
diff --git a/core/migrations/0003_remove_crawltask_websites_crawltask_websites.py b/core/migrations/0003_remove_crawltask_websites_crawltask_websites.py
new file mode 100644
index 0000000..5d05397
--- /dev/null
+++ b/core/migrations/0003_remove_crawltask_websites_crawltask_websites.py
@@ -0,0 +1,22 @@
+# Generated by Django 5.1 on 2025-09-23 19:34
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0002_crawltask'),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name='crawltask',
+ name='websites',
+ ),
+ migrations.AddField(
+ model_name='crawltask',
+ name='websites',
+ field=models.ManyToManyField(blank=True, to='core.website', verbose_name='目标网站'),
+ ),
+ ]
diff --git a/core/models.py b/core/models.py
index a176335..8b037dc 100644
--- a/core/models.py
+++ b/core/models.py
@@ -1,4 +1,6 @@
from django.db import models
+from django.utils import timezone
+import json
class Website(models.Model):
@@ -25,3 +27,93 @@ class Article(models.Model):
def __str__(self):
return self.title
+
+
+class CrawlTask(models.Model):
+ """爬取任务模型"""
+ TASK_STATUS_CHOICES = [
+ ('pending', '等待中'),
+ ('running', '运行中'),
+ ('completed', '已完成'),
+ ('failed', '失败'),
+ ('cancelled', '已取消'),
+ ]
+
+ TASK_TYPE_CHOICES = [
+ ('keyword', '关键词搜索'),
+ ('historical', '历史文章'),
+ ('full_site', '全站爬取'),
+ ]
+
+ name = models.CharField(max_length=200, verbose_name="任务名称")
+ task_type = models.CharField(max_length=20, choices=TASK_TYPE_CHOICES, default='keyword', verbose_name="任务类型")
+ keyword = models.CharField(max_length=200, blank=True, null=True, verbose_name="搜索关键词")
+ websites = models.ManyToManyField(Website, blank=True, verbose_name="目标网站")
+ start_date = models.DateField(blank=True, null=True, verbose_name="开始日期")
+ end_date = models.DateField(blank=True, null=True, verbose_name="结束日期")
+ max_pages = models.IntegerField(default=10, verbose_name="最大页数")
+ max_articles = models.IntegerField(default=100, verbose_name="最大文章数")
+
+ status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name="状态")
+ progress = models.IntegerField(default=0, verbose_name="进度百分比")
+ current_website = models.CharField(max_length=100, blank=True, null=True, verbose_name="当前网站")
+ current_action = models.CharField(max_length=200, blank=True, null=True, verbose_name="当前操作")
+
+ total_articles = models.IntegerField(default=0, verbose_name="总文章数")
+ success_count = models.IntegerField(default=0, verbose_name="成功数")
+ failed_count = models.IntegerField(default=0, verbose_name="失败数")
+
+ created_at = models.DateTimeField(auto_now_add=True, verbose_name="创建时间")
+ started_at = models.DateTimeField(blank=True, null=True, verbose_name="开始时间")
+ completed_at = models.DateTimeField(blank=True, null=True, verbose_name="完成时间")
+
+ error_message = models.TextField(blank=True, null=True, verbose_name="错误信息")
+ result_details = models.JSONField(default=dict, blank=True, verbose_name="结果详情")
+
+ created_by = models.CharField(max_length=100, blank=True, null=True, verbose_name="创建者")
+
+ class Meta:
+ verbose_name = "爬取任务"
+ verbose_name_plural = "爬取任务"
+ ordering = ['-created_at']
+
+ def __str__(self):
+ return f"{self.name} ({self.get_status_display()})"
+
+ def get_websites_display(self):
+ """获取网站列表的显示文本"""
+ websites = self.websites.all()
+ if not websites:
+ return "所有网站"
+ return ", ".join([w.name for w in websites])
+
+ def get_duration(self):
+ """获取任务执行时长"""
+ if not self.started_at:
+ return None
+ end_time = self.completed_at or timezone.now()
+ return end_time - self.started_at
+
+ def is_running(self):
+ """判断任务是否正在运行"""
+ return self.status == 'running'
+
+ def can_cancel(self):
+ """判断任务是否可以取消"""
+ return self.status in ['pending', 'running']
+
+ def get_progress_display(self):
+ """获取进度显示文本"""
+ if self.status == 'pending':
+ return "等待开始"
+ elif self.status == 'running':
+ if self.current_website and self.current_action:
+ return f"正在处理 {self.current_website}: {self.current_action}"
+ return f"运行中 ({self.progress}%)"
+ elif self.status == 'completed':
+ return f"已完成 ({self.success_count}/{self.total_articles})"
+ elif self.status == 'failed':
+ return f"失败: {self.error_message[:50]}..." if self.error_message else "失败"
+ elif self.status == 'cancelled':
+ return "已取消"
+ return "未知状态"
\ No newline at end of file
diff --git a/core/static/admin/js/crawl_task_actions.js b/core/static/admin/js/crawl_task_actions.js
new file mode 100644
index 0000000..bf6b676
--- /dev/null
+++ b/core/static/admin/js/crawl_task_actions.js
@@ -0,0 +1,84 @@
+/**
+ * 爬取任务操作JavaScript
+ */
+
+function startTask(taskId) {
+ if (confirm('确定要启动这个任务吗?')) {
+ fetch(`/admin/core/crawltask/${taskId}/start/`, {
+ method: 'POST',
+ headers: {
+ 'X-CSRFToken': getCookie('csrftoken'),
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ },
+ })
+ .then(response => {
+ if (response.ok) {
+ location.reload();
+ } else {
+ alert('启动任务失败');
+ }
+ })
+ .catch(error => {
+ console.error('Error:', error);
+ alert('启动任务失败');
+ });
+ }
+}
+
+function cancelTask(taskId) {
+ if (confirm('确定要取消这个任务吗?')) {
+ fetch(`/admin/core/crawltask/${taskId}/cancel/`, {
+ method: 'POST',
+ headers: {
+ 'X-CSRFToken': getCookie('csrftoken'),
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ },
+ })
+ .then(response => {
+ if (response.ok) {
+ location.reload();
+ } else {
+ alert('取消任务失败');
+ }
+ })
+ .catch(error => {
+ console.error('Error:', error);
+ alert('取消任务失败');
+ });
+ }
+}
+
+function viewResults(taskId) {
+ window.open(`/admin/core/crawltask/${taskId}/results/`, '_blank');
+}
+
+function getCookie(name) {
+ let cookieValue = null;
+ if (document.cookie && document.cookie !== '') {
+ const cookies = document.cookie.split(';');
+ for (let i = 0; i < cookies.length; i++) {
+ const cookie = cookies[i].trim();
+ if (cookie.substring(0, name.length + 1) === (name + '=')) {
+ cookieValue = decodeURIComponent(cookie.substring(name.length + 1));
+ break;
+ }
+ }
+ }
+ return cookieValue;
+}
+
+// 自动刷新运行中的任务状态
+function autoRefreshRunningTasks() {
+ const runningTasks = document.querySelectorAll('[data-task-status="running"]');
+ if (runningTasks.length > 0) {
+ // 每30秒刷新一次页面
+ setTimeout(() => {
+ location.reload();
+ }, 30000);
+ }
+}
+
+// 页面加载完成后执行
+document.addEventListener('DOMContentLoaded', function() {
+ autoRefreshRunningTasks();
+});
diff --git a/core/task_executor.py b/core/task_executor.py
new file mode 100644
index 0000000..915c3b7
--- /dev/null
+++ b/core/task_executor.py
@@ -0,0 +1,235 @@
+"""
+爬取任务执行器
+负责执行爬取任务并更新任务状态
+"""
+
+import threading
+import time
+from django.utils import timezone
+from django.db import transaction
+from core.models import CrawlTask
+from core.utils import crawl_by_keyword, crawl_historical_articles, full_site_crawler, WEBSITE_SEARCH_CONFIGS
+
+
+class TaskExecutor:
+ """任务执行器"""
+
+ def __init__(self):
+ self.running_tasks = {}
+ self.lock = threading.Lock()
+
+ def start_task(self, task_id):
+ """启动任务"""
+ with self.lock:
+ if task_id in self.running_tasks:
+ return False, "任务已在运行中"
+
+ try:
+ task = CrawlTask.objects.get(id=task_id)
+ if task.status != 'pending':
+ return False, "任务状态不允许启动"
+
+ # 更新任务状态
+ task.status = 'running'
+ task.started_at = timezone.now()
+ task.save()
+
+ # 启动后台线程执行任务
+ thread = threading.Thread(target=self._execute_task, args=(task_id,))
+ thread.daemon = True
+ thread.start()
+
+ self.running_tasks[task_id] = thread
+ return True, "任务已启动"
+
+ except CrawlTask.DoesNotExist:
+ return False, "任务不存在"
+ except Exception as e:
+ return False, f"启动任务失败: {e}"
+
+ def cancel_task(self, task_id):
+ """取消任务"""
+ with self.lock:
+ if task_id in self.running_tasks:
+ # 标记任务为取消状态
+ try:
+ task = CrawlTask.objects.get(id=task_id)
+ task.status = 'cancelled'
+ task.completed_at = timezone.now()
+ task.save()
+
+ # 移除运行中的任务
+ del self.running_tasks[task_id]
+ return True, "任务已取消"
+ except CrawlTask.DoesNotExist:
+ return False, "任务不存在"
+ else:
+ return False, "任务未在运行中"
+
+ def _execute_task(self, task_id):
+ """执行任务的核心逻辑"""
+ try:
+ task = CrawlTask.objects.get(id=task_id)
+
+ # 根据任务类型执行不同的爬取逻辑
+ if task.task_type == 'keyword':
+ self._execute_keyword_task(task)
+ elif task.task_type == 'historical':
+ self._execute_historical_task(task)
+ elif task.task_type == 'full_site':
+ self._execute_full_site_task(task)
+ else:
+ raise ValueError(f"不支持的任务类型: {task.task_type}")
+
+ # 任务完成
+ with transaction.atomic():
+ task = CrawlTask.objects.select_for_update().get(id=task_id)
+ task.status = 'completed'
+ task.completed_at = timezone.now()
+ task.progress = 100
+ task.save()
+
+ except Exception as e:
+ # 任务失败
+ try:
+ with transaction.atomic():
+ task = CrawlTask.objects.select_for_update().get(id=task_id)
+ task.status = 'failed'
+ task.completed_at = timezone.now()
+ task.error_message = str(e)
+ task.save()
+            except Exception:
+                pass
+
+ finally:
+ # 清理运行中的任务记录
+ with self.lock:
+ if task_id in self.running_tasks:
+ del self.running_tasks[task_id]
+
+ def _execute_keyword_task(self, task):
+ """执行关键词搜索任务"""
+ # 更新当前操作
+ task.current_action = "开始关键词搜索"
+ task.save()
+
+ # 准备参数
+        websites = list(task.websites.values_list('name', flat=True)) or list(WEBSITE_SEARCH_CONFIGS.keys())
+ start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None
+ end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None
+
+ # 执行爬取
+ results = crawl_by_keyword(
+ keyword=task.keyword,
+ website_names=websites,
+ max_pages=task.max_pages,
+ start_date=start_date,
+ end_date=end_date,
+ max_articles=task.max_articles
+ )
+
+ # 更新结果
+ task.total_articles = results['total_articles']
+ task.success_count = results['success_count']
+ task.failed_count = results['failed_count']
+ task.result_details = results['website_results']
+ task.save()
+
+ def _execute_historical_task(self, task):
+ """执行历史文章任务"""
+ # 更新当前操作
+ task.current_action = "开始历史文章爬取"
+ task.save()
+
+ # 准备参数
+        websites = list(task.websites.values_list('name', flat=True)) or list(WEBSITE_SEARCH_CONFIGS.keys())
+ start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None
+ end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None
+
+ # 执行爬取
+ results = crawl_historical_articles(
+ website_names=websites,
+ start_date=start_date,
+ end_date=end_date,
+ max_articles_per_site=task.max_articles
+ )
+
+ # 更新结果
+ task.total_articles = results['total_articles']
+ task.success_count = results['success_count']
+ task.failed_count = results['failed_count']
+ task.result_details = results['website_results']
+ task.save()
+
+ def _execute_full_site_task(self, task):
+ """执行全站爬取任务"""
+ # 更新当前操作
+ task.current_action = "开始全站爬取"
+ task.save()
+
+ # 准备参数
+        websites = list(task.websites.values_list('name', flat=True)) or list(WEBSITE_SEARCH_CONFIGS.keys())
+
+ total_websites = len(websites)
+ completed_websites = 0
+
+ for website_name in websites:
+ try:
+ # 更新当前网站
+ task.current_website = website_name
+ task.current_action = f"正在爬取 {website_name}"
+ task.save()
+
+ # 获取或创建网站对象
+ from core.models import Website
+ website, created = Website.objects.get_or_create(
+ name=website_name,
+ defaults={
+ 'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
+ 'enabled': True
+ }
+ )
+
+ # 执行全站爬取
+ full_site_crawler(
+ WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
+ website,
+ max_pages=task.max_pages
+ )
+
+ completed_websites += 1
+ progress = int((completed_websites / total_websites) * 100)
+ task.progress = progress
+ task.save()
+
+ except Exception as e:
+ # 记录错误但继续处理其他网站
+ print(f"爬取网站 {website_name} 时出错: {e}")
+ continue
+
+ # 更新最终结果
+ task.total_articles = completed_websites # 这里可以改为实际爬取的文章数
+ task.success_count = completed_websites
+ task.failed_count = total_websites - completed_websites
+ task.save()
+
+ def get_task_status(self, task_id):
+ """获取任务状态"""
+ try:
+ task = CrawlTask.objects.get(id=task_id)
+ return {
+ 'status': task.status,
+ 'progress': task.progress,
+ 'current_website': task.current_website,
+ 'current_action': task.current_action,
+ 'total_articles': task.total_articles,
+ 'success_count': task.success_count,
+ 'failed_count': task.failed_count,
+ 'error_message': task.error_message
+ }
+ except CrawlTask.DoesNotExist:
+ return None
+
+
+# 全局任务执行器实例
+task_executor = TaskExecutor()
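+
+# 用法示意(非正式 API 文档,以上述方法签名与实现为准):
+#   from core.task_executor import task_executor
+#   ok, msg = task_executor.start_task(task.id)    # 仅能启动 pending 状态的任务,由后台线程执行
+#   ok, msg = task_executor.cancel_task(task.id)   # 仅能取消由本执行器启动且仍在跟踪中的任务
+#   info = task_executor.get_task_status(task.id)  # 返回进度/统计信息的字典,任务不存在时返回 None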
diff --git a/core/templates/admin/create_full_site_task.html b/core/templates/admin/create_full_site_task.html
new file mode 100644
index 0000000..81cde17
--- /dev/null
+++ b/core/templates/admin/create_full_site_task.html
@@ -0,0 +1,139 @@
+{% extends "admin/base_site.html" %}
+{% load i18n admin_urls static admin_modify %}
+
+{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
+
+{% block breadcrumbs %}
+
+{% endblock %}
+
+{% block content %}
+{{ title }}
+
+
+ 注意:全站爬取会爬取整个网站的所有文章,可能需要很长时间。建议在非高峰时段进行。
+
+
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/core/templates/admin/create_historical_task.html b/core/templates/admin/create_historical_task.html
new file mode 100644
index 0000000..4e8920c
--- /dev/null
+++ b/core/templates/admin/create_historical_task.html
@@ -0,0 +1,164 @@
+{% extends "admin/base_site.html" %}
+{% load i18n admin_urls static admin_modify %}
+
+{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
+
+{% block breadcrumbs %}
+
+{% endblock %}
+
+{% block content %}
+{{ title }}
+
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/core/templates/admin/create_keyword_task.html b/core/templates/admin/create_keyword_task.html
new file mode 100644
index 0000000..27bdaea
--- /dev/null
+++ b/core/templates/admin/create_keyword_task.html
@@ -0,0 +1,180 @@
+{% extends "admin/base_site.html" %}
+{% load i18n admin_urls static admin_modify %}
+
+{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
+
+{% block breadcrumbs %}
+
+{% endblock %}
+
+{% block content %}
+{{ title }}
+
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/core/templates/admin/index.html b/core/templates/admin/index.html
new file mode 100644
index 0000000..1b5fa38
--- /dev/null
+++ b/core/templates/admin/index.html
@@ -0,0 +1,172 @@
+{% extends "admin/base_site.html" %}
+{% load i18n static %}
+
+{% block extrastyle %}{{ block.super }}{% endblock %}
+
+{% block coltype %}colMS{% endblock %}
+
+{% block bodyclass %}{{ block.super }} dashboard{% endblock %}
+
+{% block breadcrumbs %}{% endblock %}
+
+{% block nav-sidebar %}{% endblock %}
+
+{% block content %}
+
+
+{% if app_list %}
+ {% for app in app_list %}
+
+ {% endfor %}
+{% else %}
+
{% translate "You don't have permission to view or edit anything." %}
+{% endif %}
+
+
+
+
+
+
+
最近任务状态
+
+ {% load core_extras %}
+ {% get_recent_tasks as recent_tasks %}
+ {% if recent_tasks %}
+
+
+
+ | 任务名称 |
+ 类型 |
+ 状态 |
+ 进度 |
+ 创建时间 |
+ 操作 |
+
+
+
+ {% for task in recent_tasks %}
+
+ | {{ task.name }} |
+ {{ task.get_task_type_display }} |
+
+
+ {{ task.get_status_display }}
+
+ |
+
+ {% if task.status == 'running' %}
+
+
+ {{ task.progress }}%
+
+
+ {% else %}
+ -
+ {% endif %}
+ |
+ {{ task.created_at|date:"m-d H:i" }} |
+
+ 查看
+ |
+
+ {% endfor %}
+
+
+ {% else %}
+
暂无任务
+ {% endif %}
+
+
+
+
+{% endblock %}
+
+{% block sidebar %}
+
+
+
{% translate 'Recent actions' %}
+
{% translate 'My actions' %}
+ {% load log %}
+ {% get_admin_log 10 as admin_log for_user user %}
+ {% if not admin_log %}
+
{% translate 'None available' %}
+ {% else %}
+
+ {% for entry in admin_log %}
+ -
+ {% if entry.is_deletion or not entry.get_admin_url %}
+ {{ entry.object_repr }}
+ {% else %}
+ {{ entry.object_repr }}
+ {% endif %}
+
+ {% if entry.content_type %}
+ {% filter capfirst %}{{ entry.content_type.name }}{% endfilter %}
+ {% else %}
+ {% translate 'Unknown content' %}
+ {% endif %}
+
+ {% endfor %}
+
+ {% endif %}
+
+
+{% endblock %}
diff --git a/core/templates/admin/task_results.html b/core/templates/admin/task_results.html
new file mode 100644
index 0000000..6276c4d
--- /dev/null
+++ b/core/templates/admin/task_results.html
@@ -0,0 +1,184 @@
+{% extends "admin/base_site.html" %}
+{% load i18n admin_urls static admin_modify %}
+
+{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
+
+{% block breadcrumbs %}
+
+{% endblock %}
+
+{% block content %}
+{{ title }}
+
+
+
任务概览
+
+
+ 任务名称:
+ {{ task.name }}
+
+
+ 任务类型:
+ {{ task.get_task_type_display }}
+
+
+ 状态:
+
+ {{ task.get_status_display }}
+
+
+
+ 创建时间:
+ {{ task.created_at|date:"Y-m-d H:i:s" }}
+
+ {% if task.started_at %}
+
+ 开始时间:
+ {{ task.started_at|date:"Y-m-d H:i:s" }}
+
+ {% endif %}
+ {% if task.completed_at %}
+
+ 完成时间:
+ {{ task.completed_at|date:"Y-m-d H:i:s" }}
+
+ {% endif %}
+ {% if task.get_duration %}
+
+ 执行时长:
+ {{ task.duration_display }}
+
+ {% endif %}
+
+
+
+
+
统计信息
+
+
+
{{ task.total_articles }}
+
总文章数
+
+
+
{{ task.success_count }}
+
成功数
+
+
+
{{ task.failed_count }}
+
失败数
+
+ {% if task.total_articles > 0 %}
+
+
+ {% widthratio task.success_count task.total_articles 100 %}%
+
+
成功率
+
+ {% endif %}
+
+
+
+{% if task.keyword %}
+
+
任务配置
+
+
+ 搜索关键词:
+ {{ task.keyword }}
+
+
+ 目标网站:
+ {{ task.get_websites_display }}
+
+ {% if task.start_date %}
+
+ 开始日期:
+ {{ task.start_date }}
+
+ {% endif %}
+ {% if task.end_date %}
+
+ 结束日期:
+ {{ task.end_date }}
+
+ {% endif %}
+
+ 最大页数:
+ {{ task.max_pages }}
+
+
+ 最大文章数:
+ {{ task.max_articles }}
+
+
+
+{% endif %}
+
+{% if task.current_website or task.current_action %}
+
+
当前状态
+ {% if task.current_website %}
+
+ 当前网站: {{ task.current_website }}
+
+ {% endif %}
+ {% if task.current_action %}
+
+ 当前操作: {{ task.current_action }}
+
+ {% endif %}
+ {% if task.status == 'running' %}
+
+
+
+ {{ task.progress }}%
+
+
+
+ {% endif %}
+
+{% endif %}
+
+{% if task.error_message %}
+
+
错误信息
+
{{ task.error_message }}
+
+{% endif %}
+
+{% if task.result_details %}
+
+
详细结果
+ {% for website, result in task.result_details.items %}
+
+
{{ website }}:
+
+ - 找到链接: {{ result.found_urls }}
+ - 已处理: {{ result.processed }}
+ - 成功: {{ result.success }}
+ - 失败: {{ result.failed }}
+ {% if result.error %}
+ - 错误: {{ result.error }}
+ {% endif %}
+
+
+ {% endfor %}
+
+{% endif %}
+
+
+
+{% endblock %}
diff --git a/core/templatetags/__init__.py b/core/templatetags/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/core/templatetags/core_extras.py b/core/templatetags/core_extras.py
new file mode 100644
index 0000000..05b3996
--- /dev/null
+++ b/core/templatetags/core_extras.py
@@ -0,0 +1,46 @@
+from django import template
+from django.core.cache import cache
+from django.utils.safestring import mark_safe
+from core.models import CrawlTask
+
+register = template.Library()
+
+
+@register.simple_tag
+def get_recent_tasks(limit=5):
+ """获取最近的任务"""
+ cache_key = f'recent_tasks_{limit}'
+ recent_tasks = cache.get(cache_key)
+
+ if recent_tasks is None:
+ recent_tasks = CrawlTask.objects.all()[:limit]
+ cache.set(cache_key, recent_tasks, 60) # 缓存1分钟
+
+ return recent_tasks
+
+
+@register.filter
+def task_status_color(status):
+ """根据任务状态返回颜色"""
+ color_map = {
+ 'pending': 'gray',
+ 'running': 'blue',
+ 'completed': 'green',
+ 'failed': 'red',
+ 'cancelled': 'orange',
+ }
+ return color_map.get(status, 'gray')
+
+
+@register.filter
+def task_progress_bar(progress):
+ """生成进度条HTML"""
+ if progress is None:
+ progress = 0
+
+    return mark_safe(f'''
+    <div style="width: 100px; background-color: #f0f0f0; border-radius: 3px;">
+        <div style="width: {progress}%; background-color: #79aec8; color: #fff;
+                    text-align: center; border-radius: 3px; font-size: 11px;">{progress}%</div>
+    </div>
+    ''')
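+
+# 模板中的用法示意(仅作说明,具体以模板实现为准):
+#   {% load core_extras %}
+#   {% get_recent_tasks 5 as recent_tasks %}
+#   <span style="color: {{ task.status|task_status_color }}">{{ task.get_status_display }}</span>
+#   {{ task.progress|task_progress_bar }}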
diff --git a/core/utils.py b/core/utils.py
index 1dbac76..09b488c 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -1,7 +1,7 @@
import os
import requests
from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
+from urllib.parse import urljoin, urlparse, quote
from collections import deque
from django.utils import timezone
from django.conf import settings
@@ -15,6 +15,8 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
+from datetime import datetime, timedelta
+import json
def get_selenium_driver():
@@ -2270,3 +2272,695 @@ def full_site_crawler(start_url, website, max_pages=1000):
queue.append(href)
elif href not in visited and is_valid_url(href, base_netloc):
queue.append(href)
+
+
+# 网站搜索配置
+WEBSITE_SEARCH_CONFIGS = {
+ "新华网": {
+ "search_url": "http://so.news.cn/getNews",
+ "search_params": {
+ "keyword": "{keyword}",
+ "curPage": "{page}",
+ "sortField": "0",
+ "sortType": "1"
+ },
+ "method": "post",
+ "headers": {
+ "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+ }
+ },
+ "人民日报": {
+ "search_url": "http://search.people.com.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "st": "0",
+ "startDate": "{start_date}",
+ "endDate": "{end_date}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "央视网": {
+ "search_url": "https://search.cctv.com/search.php",
+ "search_params": {
+ "qtext": "{keyword}",
+ "type": "web",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "光明日报": {
+ "search_url": "http://search.gmw.cn/search",
+ "search_params": {
+ "q": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "经济日报": {
+ "search_url": "http://www.ce.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "中国日报": {
+ "search_url": "http://www.chinadaily.com.cn/search",
+ "search_params": {
+ "q": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "工人日报": {
+ "search_url": "https://www.workercn.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "科技日报": {
+ "search_url": "http://www.stdaily.com/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "人民政协网": {
+ "search_url": "https://www.rmzxw.com.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "中国纪检监察报": {
+ "search_url": "http://www.jjjcb.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "中国新闻社": {
+ "search_url": "https://www.chinanews.com.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "学习时报": {
+ "search_url": "https://www.studytimes.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "中国青年报": {
+ "search_url": "http://news.cyol.com/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "中国妇女报": {
+ "search_url": "https://www.cnwomen.com.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "法治日报": {
+ "search_url": "http://www.legaldaily.com.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "农民日报": {
+ "search_url": "https://www.farmer.com.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "学习强国": {
+ "search_url": "https://www.xuexi.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "旗帜网": {
+ "search_url": "http://www.qizhiwang.org.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "中国网": {
+ "search_url": "http://www.china.com.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "中国政府网": {
+ "search_url": "https://www.gov.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "求是网": {
+ "search_url": "http://www.qstheory.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ },
+ "解放军报": {
+ "search_url": "http://www.81.cn/search",
+ "search_params": {
+ "keyword": "{keyword}",
+ "page": "{page}"
+ },
+ "method": "get"
+ }
+}
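+
+# 说明:上述配置由 search_articles_by_keyword 按页展开为实际请求;
+# 例如"央视网"第 2 页搜索"乡村振兴"时,大致等价于(示例,实际可用性取决于目标站点接口):
+#   requests.get("https://search.cctv.com/search.php",
+#                params={"qtext": "乡村振兴", "type": "web", "page": "2"},
+#                headers=headers, timeout=15)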
+
+
+def search_articles_by_keyword(website_name, keyword, max_pages=10, start_date=None, end_date=None):
+ """
+ 根据关键词搜索文章
+
+ Args:
+ website_name: 网站名称
+ keyword: 搜索关键词
+ max_pages: 最大搜索页数
+ start_date: 开始日期 (YYYY-MM-DD)
+ end_date: 结束日期 (YYYY-MM-DD)
+
+ Returns:
+ list: 搜索到的文章URL列表
+ """
+ if website_name not in WEBSITE_SEARCH_CONFIGS:
+ print(f"网站 {website_name} 不支持搜索功能")
+ return []
+
+ config = WEBSITE_SEARCH_CONFIGS[website_name]
+ article_urls = []
+
+ # 设置默认日期范围
+ if not start_date:
+ start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
+ if not end_date:
+ end_date = datetime.now().strftime("%Y-%m-%d")
+
+ headers = {
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+ }
+ headers.update(config.get("headers", {}))
+
+ for page in range(1, max_pages + 1):
+ try:
+ # 构建搜索参数
+ search_params = {}
+ for key, value in config["search_params"].items():
+                # requests 会对 params/data 自动做 URL 编码,这里传原始关键词即可,避免二次编码
+                search_params[key] = value.format(
+                    keyword=keyword,
+                    page=page,
+                    start_date=start_date,
+                    end_date=end_date
+                )
+
+ print(f"搜索 {website_name} 第 {page} 页: {keyword}")
+
+ if config["method"] == "post":
+ response = requests.post(
+ config["search_url"],
+ data=search_params,
+ headers=headers,
+ timeout=15
+ )
+ else:
+ response = requests.get(
+ config["search_url"],
+ params=search_params,
+ headers=headers,
+ timeout=15
+ )
+
+ response.raise_for_status()
+ response.encoding = 'utf-8'
+
+ # 解析搜索结果
+ soup = BeautifulSoup(response.text, "html.parser")
+ page_urls = extract_search_results(soup, website_name)
+
+ if not page_urls:
+ print(f"第 {page} 页没有找到更多结果")
+ break
+
+ article_urls.extend(page_urls)
+ print(f"第 {page} 页找到 {len(page_urls)} 篇文章")
+
+ # 避免请求过快
+ time.sleep(1)
+
+ except Exception as e:
+ print(f"搜索第 {page} 页时出错: {e}")
+ continue
+
+ print(f"总共找到 {len(article_urls)} 篇文章")
+ return article_urls
+
+
+# 各网站搜索结果页中文章链接的特征子串:
+# 每个条目是若干"子串组"的列表;组内子串须全部出现在 href 中(AND),组与组之间命中任意一组即可(OR)
+SEARCH_RESULT_URL_PATTERNS = {
+    "新华网": [("/news/",), ("/article/",)],
+    "人民日报": [("/n1/",), ("/article/",)],
+    "央视网": [("/news/",), ("ARTI",)],
+    "光明日报": [("/article/",), ("/content/",)],
+    "经济日报": [("/article/",), ("/content/",)],
+    "中国日报": [("/article/",), ("/content/",)],
+    "工人日报": [("/c/",), ("/article/",)],
+    "科技日报": [("/article/",), ("/content/",)],
+    "人民政协网": [("/article/",), ("/content/",)],
+    "中国纪检监察报": [("/article/",), ("/content/",)],
+    "中国新闻社": [("/article/",), ("/content/",)],
+    "学习时报": [("/article/",), ("/content/",)],
+    "中国青年报": [("/article/",), ("/content/",)],
+    "中国妇女报": [("/article/",), ("/content/",)],
+    "法治日报": [("/content/", "content_")],
+    "农民日报": [("/article/",), ("/content/",)],
+    "学习强国": [("/article/",), ("/content/",)],
+    "旗帜网": [("/n1/",), ("/article/",)],
+    "中国网": [("/article/",), ("/opinion/",)],
+    "中国政府网": [("/zhengce/",), ("/xinwen/",)],
+    "求是网": [("/article/",), ("/content/",)],
+    "解放军报": [("/zt/",), ("/article/",)],
+}
+
+
+def extract_search_results(soup, website_name):
+    """
+    从搜索结果页面提取文章链接
+
+    Args:
+        soup: BeautifulSoup对象
+        website_name: 网站名称
+
+    Returns:
+        list: 文章URL列表(已去重)
+    """
+    patterns = SEARCH_RESULT_URL_PATTERNS.get(website_name)
+    if not patterns:
+        return []
+
+    urls = []
+    for link in soup.find_all("a", href=True):
+        href = link["href"]
+        if any(all(token in href for token in group) for group in patterns):
+            urls.append(href)
+
+    # 去重并返回
+    return list(set(urls))
+
+
+def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None, end_date=None, max_articles=100):
+ """
+ 根据关键词爬取多个网站的文章
+
+ Args:
+ keyword: 搜索关键词
+ website_names: 网站名称列表,如果为None则爬取所有支持的网站
+ max_pages: 每个网站最大搜索页数
+ start_date: 开始日期 (YYYY-MM-DD)
+ end_date: 结束日期 (YYYY-MM-DD)
+ max_articles: 最大文章数量
+
+ Returns:
+ dict: 爬取结果统计
+ """
+ if website_names is None:
+ website_names = list(WEBSITE_SEARCH_CONFIGS.keys())
+
+ results = {
+ "keyword": keyword,
+ "total_articles": 0,
+ "success_count": 0,
+ "failed_count": 0,
+ "website_results": {}
+ }
+
+ print(f"开始根据关键词 '{keyword}' 爬取文章...")
+ print(f"目标网站: {', '.join(website_names)}")
+
+ for website_name in website_names:
+ print(f"\n开始爬取 {website_name}...")
+
+ try:
+ # 获取或创建网站对象
+ from core.models import Website
+ website, created = Website.objects.get_or_create(
+ name=website_name,
+ defaults={
+ 'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
+ 'enabled': True
+ }
+ )
+
+ # 搜索文章URL
+ article_urls = search_articles_by_keyword(
+ website_name, keyword, max_pages, start_date, end_date
+ )
+
+ if not article_urls:
+ print(f"{website_name} 没有找到相关文章")
+ results["website_results"][website_name] = {
+ "found_urls": 0,
+ "processed": 0,
+ "success": 0,
+ "failed": 0
+ }
+ continue
+
+ # 限制文章数量
+ if len(article_urls) > max_articles:
+ article_urls = article_urls[:max_articles]
+
+ print(f"{website_name} 找到 {len(article_urls)} 篇文章,开始处理...")
+
+ website_success = 0
+ website_failed = 0
+
+ for i, url in enumerate(article_urls, 1):
+ try:
+ print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
+ process_article(url, website)
+ website_success += 1
+ results["success_count"] += 1
+
+ # 避免请求过快
+ time.sleep(0.5)
+
+ except Exception as e:
+ print(f"处理文章失败: {url}, 错误: {e}")
+ website_failed += 1
+ results["failed_count"] += 1
+
+ results["website_results"][website_name] = {
+ "found_urls": len(article_urls),
+ "processed": len(article_urls),
+ "success": website_success,
+ "failed": website_failed
+ }
+
+ print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")
+
+ except Exception as e:
+ print(f"爬取 {website_name} 时出错: {e}")
+ results["website_results"][website_name] = {
+ "found_urls": 0,
+ "processed": 0,
+ "success": 0,
+ "failed": 1,
+ "error": str(e)
+ }
+ results["failed_count"] += 1
+
+ results["total_articles"] = results["success_count"] + results["failed_count"]
+
+ print(f"\n爬取完成!")
+ print(f"关键词: {keyword}")
+ print(f"总文章数: {results['total_articles']}")
+ print(f"成功: {results['success_count']}")
+ print(f"失败: {results['failed_count']}")
+
+ return results
+
+
+def crawl_historical_articles(website_names=None, start_date=None, end_date=None, max_articles_per_site=50):
+ """
+ 爬取历史文章
+
+ Args:
+ website_names: 网站名称列表
+ start_date: 开始日期 (YYYY-MM-DD)
+ end_date: 结束日期 (YYYY-MM-DD)
+ max_articles_per_site: 每个网站最大文章数
+
+ Returns:
+ dict: 爬取结果统计
+ """
+ if not start_date:
+ start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
+ if not end_date:
+ end_date = datetime.now().strftime("%Y-%m-%d")
+
+ if website_names is None:
+ website_names = list(WEBSITE_SEARCH_CONFIGS.keys())
+
+ results = {
+ "start_date": start_date,
+ "end_date": end_date,
+ "total_articles": 0,
+ "success_count": 0,
+ "failed_count": 0,
+ "website_results": {}
+ }
+
+ print(f"开始爬取历史文章...")
+ print(f"日期范围: {start_date} 到 {end_date}")
+ print(f"目标网站: {', '.join(website_names)}")
+
+ # 使用通用关键词搜索历史文章
+ common_keywords = ["新闻", "报道", "文章", "资讯", "动态"]
+
+ for website_name in website_names:
+ print(f"\n开始爬取 {website_name} 历史文章...")
+
+ try:
+ from core.models import Website
+ website, created = Website.objects.get_or_create(
+ name=website_name,
+ defaults={
+ 'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
+ 'enabled': True
+ }
+ )
+
+ website_success = 0
+ website_failed = 0
+ all_urls = set()
+
+ # 使用多个关键词搜索
+ for keyword in common_keywords:
+ try:
+ article_urls = search_articles_by_keyword(
+ website_name, keyword, max_pages=5,
+ start_date=start_date, end_date=end_date
+ )
+ all_urls.update(article_urls)
+
+ if len(all_urls) >= max_articles_per_site:
+ break
+
+ except Exception as e:
+ print(f"搜索关键词 '{keyword}' 时出错: {e}")
+ continue
+
+ # 限制文章数量
+ article_urls = list(all_urls)[:max_articles_per_site]
+
+ if not article_urls:
+ print(f"{website_name} 没有找到历史文章")
+ results["website_results"][website_name] = {
+ "found_urls": 0,
+ "processed": 0,
+ "success": 0,
+ "failed": 0
+ }
+ continue
+
+ print(f"{website_name} 找到 {len(article_urls)} 篇历史文章,开始处理...")
+
+ for i, url in enumerate(article_urls, 1):
+ try:
+ print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
+ process_article(url, website)
+ website_success += 1
+ results["success_count"] += 1
+
+ time.sleep(0.5)
+
+ except Exception as e:
+ print(f"处理文章失败: {url}, 错误: {e}")
+ website_failed += 1
+ results["failed_count"] += 1
+
+ results["website_results"][website_name] = {
+ "found_urls": len(article_urls),
+ "processed": len(article_urls),
+ "success": website_success,
+ "failed": website_failed
+ }
+
+ print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")
+
+ except Exception as e:
+ print(f"爬取 {website_name} 历史文章时出错: {e}")
+ results["website_results"][website_name] = {
+ "found_urls": 0,
+ "processed": 0,
+ "success": 0,
+ "failed": 1,
+ "error": str(e)
+ }
+ results["failed_count"] += 1
+
+ results["total_articles"] = results["success_count"] + results["failed_count"]
+
+ print(f"\n历史文章爬取完成!")
+ print(f"日期范围: {start_date} 到 {end_date}")
+ print(f"总文章数: {results['total_articles']}")
+ print(f"成功: {results['success_count']}")
+ print(f"失败: {results['failed_count']}")
+
+ return results