diff --git a/core/admin_extended.py b/core/admin_extended.py index c496a07..feb5f4a 100644 --- a/core/admin_extended.py +++ b/core/admin_extended.py @@ -16,9 +16,10 @@ from django.utils import timezone from django.db.models import Count, Q from django.core.cache import cache -from .models import Website, Article +from .models import Website, Article, CrawlTask from .tasks import crawl_website, crawl_all_websites, cleanup_old_articles from .distributed_crawler import distributed_crawler +from .task_executor import task_executor logger = logging.getLogger(__name__) @@ -411,6 +412,419 @@ class ArticleAdmin(admin.ModelAdmin): actions_column.short_description = '操作' +class CrawlTaskStatusFilter(SimpleListFilter): + """爬取任务状态过滤器""" + title = '任务状态' + parameter_name = 'status' + + def lookups(self, request, model_admin): + return ( + ('pending', '等待中'), + ('running', '运行中'), + ('completed', '已完成'), + ('failed', '失败'), + ('cancelled', '已取消'), + ) + + def queryset(self, request, queryset): + if self.value(): + return queryset.filter(status=self.value()) + return queryset + + +class CrawlTaskTypeFilter(SimpleListFilter): + """爬取任务类型过滤器""" + title = '任务类型' + parameter_name = 'task_type' + + def lookups(self, request, model_admin): + return ( + ('keyword', '关键词搜索'), + ('historical', '历史文章'), + ('full_site', '全站爬取'), + ) + + def queryset(self, request, queryset): + if self.value(): + return queryset.filter(task_type=self.value()) + return queryset + + +class CrawlTaskAdmin(admin.ModelAdmin): + """爬取任务管理""" + list_display = [ + 'name', 'task_type', 'keyword', 'websites_display', 'status', + 'progress_display', 'created_at', 'duration_display', 'actions_column' + ] + list_filter = [CrawlTaskStatusFilter, CrawlTaskTypeFilter, 'created_at'] + search_fields = ['name', 'keyword', 'created_by'] + readonly_fields = [ + 'status', 'progress', 'current_website', 'current_action', + 'total_articles', 'success_count', 'failed_count', + 'created_at', 'started_at', 'completed_at', 'error_message', + 'result_details', 'duration_display', 'progress_display' + ] + actions = ['start_tasks', 'cancel_tasks', 'delete_completed_tasks'] + + class Media: + js = ('admin/js/crawl_task_actions.js',) + + fieldsets = ( + ('基本信息', { + 'fields': ('name', 'task_type', 'keyword') + }), + ('爬取配置', { + 'fields': ('websites', 'start_date', 'end_date', 'max_pages', 'max_articles') + }), + ('任务状态', { + 'fields': ('status', 'progress_display', 'current_website', 'current_action'), + 'classes': ('collapse',) + }), + ('统计信息', { + 'fields': ('total_articles', 'success_count', 'failed_count'), + 'classes': ('collapse',) + }), + ('时间信息', { + 'fields': ('created_at', 'started_at', 'completed_at', 'duration_display'), + 'classes': ('collapse',) + }), + ('错误信息', { + 'fields': ('error_message',), + 'classes': ('collapse',) + }), + ('结果详情', { + 'fields': ('result_details',), + 'classes': ('collapse',) + }), + ) + + def websites_display(self, obj): + """网站列表显示""" + return obj.get_websites_display() + websites_display.short_description = '目标网站' + + def progress_display(self, obj): + """进度显示""" + if obj.status == 'running': + return format_html( + '
<div style="width: 100px; background: #f0f0f0; border-radius: 3px;">'
+                '<div style="width: {}%; background: #79aec8; color: #fff; text-align: center; border-radius: 3px;">{}%</div>'
+                '</div>
', + obj.progress, obj.progress + ) + elif obj.status == 'completed': + return format_html('✓ 完成') + elif obj.status == 'failed': + return format_html('✗ 失败') + elif obj.status == 'cancelled': + return format_html('⊘ 已取消') + else: + return format_html('⏳ 等待') + progress_display.short_description = '进度' + + def duration_display(self, obj): + """执行时长显示""" + duration = obj.get_duration() + if duration: + total_seconds = int(duration.total_seconds()) + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + seconds = total_seconds % 60 + if hours > 0: + return f"{hours}小时{minutes}分钟" + elif minutes > 0: + return f"{minutes}分钟{seconds}秒" + else: + return f"{seconds}秒" + return "-" + duration_display.short_description = '执行时长' + + def actions_column(self, obj): + """操作列""" + actions = [] + + if obj.status == 'pending': + actions.append(f'开始') + + if obj.can_cancel(): + actions.append(f'取消') + + if obj.status == 'completed': + actions.append(f'查看结果') + + return format_html(' '.join(actions)) + actions_column.short_description = '操作' + + def start_tasks(self, request, queryset): + """启动选中的任务""" + started_count = 0 + for task in queryset.filter(status='pending'): + try: + success, message = task_executor.start_task(task.id) + if success: + started_count += 1 + else: + self.message_user(request, f'启动任务 {task.name} 失败: {message}', messages.ERROR) + except Exception as e: + self.message_user(request, f'启动任务 {task.name} 失败: {e}', messages.ERROR) + + if started_count > 0: + self.message_user(request, f'成功启动 {started_count} 个任务', messages.SUCCESS) + start_tasks.short_description = '启动选中的任务' + + def cancel_tasks(self, request, queryset): + """取消选中的任务""" + cancelled_count = 0 + for task in queryset.filter(status__in=['pending', 'running']): + try: + success, message = task_executor.cancel_task(task.id) + if success: + cancelled_count += 1 + else: + self.message_user(request, f'取消任务 {task.name} 失败: {message}', messages.ERROR) + except Exception as e: + self.message_user(request, f'取消任务 {task.name} 失败: {e}', messages.ERROR) + + if cancelled_count > 0: + self.message_user(request, f'成功取消 {cancelled_count} 个任务', messages.SUCCESS) + cancel_tasks.short_description = '取消选中的任务' + + def delete_completed_tasks(self, request, queryset): + """删除已完成的任务""" + completed_tasks = queryset.filter(status__in=['completed', 'failed', 'cancelled']) + count = completed_tasks.count() + completed_tasks.delete() + + if count > 0: + self.message_user(request, f'成功删除 {count} 个已完成的任务', messages.SUCCESS) + delete_completed_tasks.short_description = '删除已完成的任务' + + def get_urls(self): + """添加自定义URL""" + urls = super().get_urls() + custom_urls = [ + path( + 'create-keyword-task/', + self.admin_site.admin_view(self.create_keyword_task_view), + name='create_keyword_task', + ), + path( + 'create-historical-task/', + self.admin_site.admin_view(self.create_historical_task_view), + name='create_historical_task', + ), + path( + 'create-full-site-task/', + self.admin_site.admin_view(self.create_full_site_task_view), + name='create_full_site_task', + ), + path( + '/start/', + self.admin_site.admin_view(self.start_task_view), + name='start_task', + ), + path( + '/cancel/', + self.admin_site.admin_view(self.cancel_task_view), + name='cancel_task', + ), + path( + '/results/', + self.admin_site.admin_view(self.view_results_view), + name='view_results', + ), + ] + return custom_urls + urls + + def create_keyword_task_view(self, request): + """创建关键词搜索任务视图""" + if request.method == 'POST': + try: + from .utils import 
WEBSITE_SEARCH_CONFIGS + + name = request.POST.get('name', '') + keyword = request.POST.get('keyword', '') + websites = request.POST.getlist('websites') + start_date = request.POST.get('start_date') + end_date = request.POST.get('end_date') + max_pages = int(request.POST.get('max_pages', 10)) + max_articles = int(request.POST.get('max_articles', 100)) + + if not name or not keyword: + self.message_user(request, '任务名称和关键词不能为空', messages.ERROR) + return HttpResponseRedirect(reverse('admin:core_crawltask_changelist')) + + # 创建任务 + task = CrawlTask.objects.create( + name=name, + task_type='keyword', + keyword=keyword, + start_date=start_date if start_date else None, + end_date=end_date if end_date else None, + max_pages=max_pages, + max_articles=max_articles, + created_by=request.user.username if request.user.is_authenticated else 'admin' + ) + + # 添加选择的网站 + if websites: + website_objects = Website.objects.filter(name__in=websites) + task.websites.set(website_objects) + + self.message_user(request, f'关键词搜索任务 "{name}" 创建成功', messages.SUCCESS) + return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id])) + + except Exception as e: + self.message_user(request, f'创建任务失败: {e}', messages.ERROR) + + # GET请求,显示创建表单 + context = { + 'websites': Website.objects.filter(enabled=True), + 'title': '创建关键词搜索任务' + } + return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_keyword_task.html', context) + + def create_historical_task_view(self, request): + """创建历史文章任务视图""" + if request.method == 'POST': + try: + from .utils import WEBSITE_SEARCH_CONFIGS + + name = request.POST.get('name', '') + websites = request.POST.getlist('websites') + start_date = request.POST.get('start_date') + end_date = request.POST.get('end_date') + max_articles = int(request.POST.get('max_articles', 50)) + + if not name: + self.message_user(request, '任务名称不能为空', messages.ERROR) + return HttpResponseRedirect(reverse('admin:core_crawltask_changelist')) + + # 创建任务 + task = CrawlTask.objects.create( + name=name, + task_type='historical', + keyword='历史文章', + start_date=start_date if start_date else None, + end_date=end_date if end_date else None, + max_articles=max_articles, + created_by=request.user.username if request.user.is_authenticated else 'admin' + ) + + # 添加选择的网站 + if websites: + website_objects = Website.objects.filter(name__in=websites) + task.websites.set(website_objects) + + self.message_user(request, f'历史文章任务 "{name}" 创建成功', messages.SUCCESS) + return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id])) + + except Exception as e: + self.message_user(request, f'创建任务失败: {e}', messages.ERROR) + + # GET请求,显示创建表单 + context = { + 'websites': Website.objects.filter(enabled=True), + 'title': '创建历史文章任务' + } + return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_historical_task.html', context) + + def create_full_site_task_view(self, request): + """创建全站爬取任务视图""" + if request.method == 'POST': + try: + from .utils import WEBSITE_SEARCH_CONFIGS + + name = request.POST.get('name', '') + websites = request.POST.getlist('websites') + max_pages = int(request.POST.get('max_pages', 500)) + + if not name: + self.message_user(request, '任务名称不能为空', messages.ERROR) + return HttpResponseRedirect(reverse('admin:core_crawltask_changelist')) + + # 创建任务 + task = CrawlTask.objects.create( + name=name, + task_type='full_site', + keyword='全站爬取', + max_pages=max_pages, + created_by=request.user.username if request.user.is_authenticated else 'admin' + 
) + + # 添加选择的网站 + if websites: + website_objects = Website.objects.filter(name__in=websites) + task.websites.set(website_objects) + + self.message_user(request, f'全站爬取任务 "{name}" 创建成功', messages.SUCCESS) + return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id])) + + except Exception as e: + self.message_user(request, f'创建任务失败: {e}', messages.ERROR) + + # GET请求,显示创建表单 + context = { + 'websites': Website.objects.filter(enabled=True), + 'title': '创建全站爬取任务' + } + return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_full_site_task.html', context) + + def start_task_view(self, request, task_id): + """启动任务视图""" + try: + success, message = task_executor.start_task(task_id) + if success: + self.message_user(request, f'任务已启动: {message}', messages.SUCCESS) + else: + self.message_user(request, f'启动任务失败: {message}', messages.ERROR) + except Exception as e: + self.message_user(request, f'启动任务失败: {e}', messages.ERROR) + + return HttpResponseRedirect(reverse('admin:core_crawltask_changelist')) + + def cancel_task_view(self, request, task_id): + """取消任务视图""" + try: + success, message = task_executor.cancel_task(task_id) + if success: + self.message_user(request, f'任务已取消: {message}', messages.SUCCESS) + else: + self.message_user(request, f'取消任务失败: {message}', messages.ERROR) + except Exception as e: + self.message_user(request, f'取消任务失败: {e}', messages.ERROR) + + return HttpResponseRedirect(reverse('admin:core_crawltask_changelist')) + + def view_results_view(self, request, task_id): + """查看结果视图""" + try: + task = CrawlTask.objects.get(id=task_id) + context = { + 'task': task, + 'title': f'任务结果 - {task.name}' + } + return admin.site.admin_view(self.render_create_task_template)(request, 'admin/task_results.html', context) + except CrawlTask.DoesNotExist: + self.message_user(request, '任务不存在', messages.ERROR) + return HttpResponseRedirect(reverse('admin:core_crawltask_changelist')) + + def render_create_task_template(self, request, template_name, context): + """渲染创建任务模板""" + from django.template.loader import render_to_string + from django.http import HttpResponse + + context.update({ + 'site_header': admin.site.site_header, + 'site_title': admin.site.site_title, + 'has_permission': True, + 'user': request.user, + }) + + html = render_to_string(template_name, context) + return HttpResponse(html) + + #class CrawlerStatusAdmin(admin.ModelAdmin): # """爬虫状态管理""" # change_list_template = 'admin/crawler_status.html' @@ -448,6 +862,7 @@ class ArticleAdmin(admin.ModelAdmin): # 注册管理类 admin.site.register(Website, WebsiteAdmin) admin.site.register(Article, ArticleAdmin) +admin.site.register(CrawlTask, CrawlTaskAdmin) # 隐藏Celery Results管理功能 diff --git a/core/management/commands/crawl_all_websites.py b/core/management/commands/crawl_all_websites.py new file mode 100644 index 0000000..25e2b0f --- /dev/null +++ b/core/management/commands/crawl_all_websites.py @@ -0,0 +1,257 @@ +from django.core.management.base import BaseCommand +from core.utils import full_site_crawler, crawl_by_keyword, WEBSITE_SEARCH_CONFIGS +from core.models import Website +import json + + +class Command(BaseCommand): + help = "一键爬取所有支持的网站" + + def add_arguments(self, parser): + parser.add_argument( + '--mode', '-m', + type=str, + choices=['full', 'keyword', 'both'], + default='full', + help='爬取模式: full(全站爬取), keyword(关键词爬取), both(两种模式)' + ) + + parser.add_argument( + '--keyword', '-k', + type=str, + help='关键词搜索模式下的搜索关键词' + ) + + parser.add_argument( + '--websites', '-w', + type=str, + 
nargs='*', + help='指定要爬取的网站名称列表,如果不指定则爬取所有支持的网站' + ) + + parser.add_argument( + '--max-pages', '-p', + type=int, + default=500, + help='全站爬取最大页数 (默认: 500)' + ) + + parser.add_argument( + '--max-search-pages', '-sp', + type=int, + default=10, + help='关键词搜索最大页数 (默认: 10)' + ) + + parser.add_argument( + '--max-articles', '-a', + type=int, + default=100, + help='关键词搜索最大文章数量 (默认: 100)' + ) + + parser.add_argument( + '--start-date', '-s', + type=str, + help='开始日期 (格式: YYYY-MM-DD)' + ) + + parser.add_argument( + '--end-date', '-e', + type=str, + help='结束日期 (格式: YYYY-MM-DD)' + ) + + parser.add_argument( + '--list-websites', '-l', + action='store_true', + help='列出所有支持的网站' + ) + + parser.add_argument( + '--output', '-o', + type=str, + help='将结果保存到JSON文件' + ) + + parser.add_argument( + '--skip-existing', + action='store_true', + help='跳过已存在的网站配置' + ) + + def handle(self, *args, **options): + # 列出支持的网站 + if options['list_websites']: + self.stdout.write(self.style.SUCCESS("支持的网站列表:")) + for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1): + self.stdout.write(f"{i:2d}. {website}") + return + + mode = options['mode'] + keyword = options['keyword'] + websites = options['websites'] + max_pages = options['max_pages'] + max_search_pages = options['max_search_pages'] + max_articles = options['max_articles'] + start_date = options['start_date'] + end_date = options['end_date'] + output_file = options['output'] + skip_existing = options['skip_existing'] + + # 验证网站名称 + if websites: + invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS] + if invalid_websites: + self.stdout.write( + self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}") + ) + self.stdout.write("使用 --list-websites 查看支持的网站列表") + return + + # 确定要爬取的网站列表 + target_websites = websites if websites else list(WEBSITE_SEARCH_CONFIGS.keys()) + + # 验证关键词模式 + if mode in ['keyword', 'both'] and not keyword: + self.stdout.write( + self.style.ERROR("关键词模式需要指定 --keyword 参数") + ) + return + + self.stdout.write(f"开始一键爬取任务...") + self.stdout.write(f"爬取模式: {mode}") + self.stdout.write(f"目标网站: {', '.join(target_websites)}") + if keyword: + self.stdout.write(f"关键词: {keyword}") + if start_date: + self.stdout.write(f"开始日期: {start_date}") + if end_date: + self.stdout.write(f"结束日期: {end_date}") + + all_results = { + "mode": mode, + "websites": target_websites, + "keyword": keyword, + "start_date": start_date, + "end_date": end_date, + "full_crawl_results": {}, + "keyword_crawl_results": {}, + "summary": { + "total_websites": len(target_websites), + "full_crawl_success": 0, + "full_crawl_failed": 0, + "keyword_crawl_success": 0, + "keyword_crawl_failed": 0 + } + } + + try: + for website_name in target_websites: + self.stdout.write(f"\n{'='*50}") + self.stdout.write(f"开始处理网站: {website_name}") + self.stdout.write(f"{'='*50}") + + # 获取或创建网站对象 + website, created = Website.objects.get_or_create( + name=website_name, + defaults={ + 'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"], + 'enabled': True + } + ) + + if not created and skip_existing: + self.stdout.write(f"跳过已存在的网站: {website_name}") + continue + + website_results = { + "full_crawl": None, + "keyword_crawl": None + } + + # 全站爬取 + if mode in ['full', 'both']: + self.stdout.write(f"\n开始全站爬取: {website_name}") + try: + full_site_crawler( + WEBSITE_SEARCH_CONFIGS[website_name]["search_url"], + website, + max_pages=max_pages + ) + self.stdout.write(self.style.SUCCESS(f"全站爬取完成: {website_name}")) + website_results["full_crawl"] = {"status": "success"} + 
all_results["summary"]["full_crawl_success"] += 1 + except Exception as e: + self.stdout.write(self.style.ERROR(f"全站爬取失败: {website_name}, 错误: {e}")) + website_results["full_crawl"] = {"status": "failed", "error": str(e)} + all_results["summary"]["full_crawl_failed"] += 1 + + # 关键词爬取 + if mode in ['keyword', 'both']: + self.stdout.write(f"\n开始关键词爬取: {website_name}") + try: + keyword_results = crawl_by_keyword( + keyword=keyword, + website_names=[website_name], + max_pages=max_search_pages, + start_date=start_date, + end_date=end_date, + max_articles=max_articles + ) + website_results["keyword_crawl"] = keyword_results + if keyword_results["success_count"] > 0: + all_results["summary"]["keyword_crawl_success"] += 1 + else: + all_results["summary"]["keyword_crawl_failed"] += 1 + except Exception as e: + self.stdout.write(self.style.ERROR(f"关键词爬取失败: {website_name}, 错误: {e}")) + website_results["keyword_crawl"] = {"status": "failed", "error": str(e)} + all_results["summary"]["keyword_crawl_failed"] += 1 + + all_results["full_crawl_results"][website_name] = website_results["full_crawl"] + all_results["keyword_crawl_results"][website_name] = website_results["keyword_crawl"] + + # 显示最终结果摘要 + self.stdout.write(f"\n{'='*50}") + self.stdout.write(self.style.SUCCESS("一键爬取完成!")) + self.stdout.write(f"{'='*50}") + self.stdout.write(f"总网站数: {all_results['summary']['total_websites']}") + + if mode in ['full', 'both']: + self.stdout.write(f"全站爬取 - 成功: {all_results['summary']['full_crawl_success']}, " + f"失败: {all_results['summary']['full_crawl_failed']}") + + if mode in ['keyword', 'both']: + self.stdout.write(f"关键词爬取 - 成功: {all_results['summary']['keyword_crawl_success']}, " + f"失败: {all_results['summary']['keyword_crawl_failed']}") + + # 显示各网站详细结果 + self.stdout.write("\n各网站详细结果:") + for website_name in target_websites: + self.stdout.write(f"\n{website_name}:") + + if mode in ['full', 'both']: + full_result = all_results["full_crawl_results"][website_name] + if full_result and full_result.get("status") == "success": + self.stdout.write(self.style.SUCCESS(" 全站爬取: 成功")) + elif full_result: + self.stdout.write(self.style.ERROR(f" 全站爬取: 失败 - {full_result.get('error', '未知错误')}")) + + if mode in ['keyword', 'both']: + keyword_result = all_results["keyword_crawl_results"][website_name] + if keyword_result and "success_count" in keyword_result: + self.stdout.write(f" 关键词爬取: 成功 {keyword_result['success_count']} 篇, " + f"失败 {keyword_result['failed_count']} 篇") + elif keyword_result and keyword_result.get("status") == "failed": + self.stdout.write(self.style.ERROR(f" 关键词爬取: 失败 - {keyword_result.get('error', '未知错误')}")) + + # 保存结果到文件 + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(all_results, f, ensure_ascii=False, indent=2) + self.stdout.write(f"\n结果已保存到: {output_file}") + + except Exception as e: + self.stdout.write(self.style.ERROR(f"一键爬取过程中出现错误: {e}")) + raise diff --git a/core/management/commands/crawl_by_keyword.py b/core/management/commands/crawl_by_keyword.py new file mode 100644 index 0000000..9624dfe --- /dev/null +++ b/core/management/commands/crawl_by_keyword.py @@ -0,0 +1,157 @@ +from django.core.management.base import BaseCommand +from core.utils import crawl_by_keyword, crawl_historical_articles, WEBSITE_SEARCH_CONFIGS +import json + + +class Command(BaseCommand): + help = "根据关键词爬取多个网站的文章" + + def add_arguments(self, parser): + parser.add_argument( + '--keyword', '-k', + type=str, + help='搜索关键词' + ) + + parser.add_argument( + '--websites', '-w', + type=str, + 
nargs='*', + help='指定要爬取的网站名称列表,如果不指定则爬取所有支持的网站' + ) + + parser.add_argument( + '--max-pages', '-p', + type=int, + default=10, + help='每个网站最大搜索页数 (默认: 10)' + ) + + parser.add_argument( + '--max-articles', '-a', + type=int, + default=100, + help='最大文章数量 (默认: 100)' + ) + + parser.add_argument( + '--start-date', '-s', + type=str, + help='开始日期 (格式: YYYY-MM-DD)' + ) + + parser.add_argument( + '--end-date', '-e', + type=str, + help='结束日期 (格式: YYYY-MM-DD)' + ) + + parser.add_argument( + '--historical', + action='store_true', + help='爬取历史文章模式' + ) + + parser.add_argument( + '--list-websites', '-l', + action='store_true', + help='列出所有支持的网站' + ) + + parser.add_argument( + '--output', '-o', + type=str, + help='将结果保存到JSON文件' + ) + + def handle(self, *args, **options): + # 列出支持的网站 + if options['list_websites']: + self.stdout.write(self.style.SUCCESS("支持的网站列表:")) + for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1): + self.stdout.write(f"{i:2d}. {website}") + return + + keyword = options['keyword'] + if not keyword: + self.stdout.write(self.style.ERROR("必须指定 --keyword 参数")) + return + websites = options['websites'] + max_pages = options['max_pages'] + max_articles = options['max_articles'] + start_date = options['start_date'] + end_date = options['end_date'] + historical = options['historical'] + output_file = options['output'] + + # 验证网站名称 + if websites: + invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS] + if invalid_websites: + self.stdout.write( + self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}") + ) + self.stdout.write("使用 --list-websites 查看支持的网站列表") + return + + self.stdout.write(f"开始爬取任务...") + self.stdout.write(f"关键词: {keyword}") + if websites: + self.stdout.write(f"目标网站: {', '.join(websites)}") + else: + self.stdout.write(f"目标网站: 所有支持的网站 ({len(WEBSITE_SEARCH_CONFIGS)}个)") + + if start_date: + self.stdout.write(f"开始日期: {start_date}") + if end_date: + self.stdout.write(f"结束日期: {end_date}") + self.stdout.write(f"最大页数: {max_pages}") + self.stdout.write(f"最大文章数: {max_articles}") + + try: + if historical: + # 历史文章爬取模式 + self.stdout.write(self.style.WARNING("使用历史文章爬取模式")) + results = crawl_historical_articles( + website_names=websites, + start_date=start_date, + end_date=end_date, + max_articles_per_site=max_articles + ) + else: + # 关键词搜索模式 + results = crawl_by_keyword( + keyword=keyword, + website_names=websites, + max_pages=max_pages, + start_date=start_date, + end_date=end_date, + max_articles=max_articles + ) + + # 显示结果摘要 + self.stdout.write(self.style.SUCCESS("\n爬取完成!")) + self.stdout.write(f"总文章数: {results['total_articles']}") + self.stdout.write(f"成功: {results['success_count']}") + self.stdout.write(f"失败: {results['failed_count']}") + + # 显示各网站详细结果 + self.stdout.write("\n各网站结果:") + for website, result in results['website_results'].items(): + status = self.style.SUCCESS if result['success'] > 0 else self.style.WARNING + self.stdout.write( + status(f" {website}: 找到 {result['found_urls']} 篇, " + f"成功 {result['success']}, 失败 {result['failed']}") + ) + if 'error' in result: + self.stdout.write(self.style.ERROR(f" 错误: {result['error']}")) + + # 保存结果到文件 + if output_file: + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(results, f, ensure_ascii=False, indent=2) + self.stdout.write(f"\n结果已保存到: {output_file}") + + except Exception as e: + self.stdout.write(self.style.ERROR(f"爬取过程中出现错误: {e}")) + raise diff --git a/core/migrations/0002_crawltask.py b/core/migrations/0002_crawltask.py new file mode 100644 index 0000000..6a62f0e --- /dev/null +++ 
b/core/migrations/0002_crawltask.py @@ -0,0 +1,45 @@ +# Generated by Django 5.1 on 2025-09-23 19:28 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='CrawlTask', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=200, verbose_name='任务名称')), + ('task_type', models.CharField(choices=[('keyword', '关键词搜索'), ('historical', '历史文章'), ('full_site', '全站爬取')], default='keyword', max_length=20, verbose_name='任务类型')), + ('keyword', models.CharField(blank=True, max_length=200, null=True, verbose_name='搜索关键词')), + ('websites', models.JSONField(default=list, verbose_name='目标网站')), + ('start_date', models.DateField(blank=True, null=True, verbose_name='开始日期')), + ('end_date', models.DateField(blank=True, null=True, verbose_name='结束日期')), + ('max_pages', models.IntegerField(default=10, verbose_name='最大页数')), + ('max_articles', models.IntegerField(default=100, verbose_name='最大文章数')), + ('status', models.CharField(choices=[('pending', '等待中'), ('running', '运行中'), ('completed', '已完成'), ('failed', '失败'), ('cancelled', '已取消')], default='pending', max_length=20, verbose_name='状态')), + ('progress', models.IntegerField(default=0, verbose_name='进度百分比')), + ('current_website', models.CharField(blank=True, max_length=100, null=True, verbose_name='当前网站')), + ('current_action', models.CharField(blank=True, max_length=200, null=True, verbose_name='当前操作')), + ('total_articles', models.IntegerField(default=0, verbose_name='总文章数')), + ('success_count', models.IntegerField(default=0, verbose_name='成功数')), + ('failed_count', models.IntegerField(default=0, verbose_name='失败数')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')), + ('started_at', models.DateTimeField(blank=True, null=True, verbose_name='开始时间')), + ('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='完成时间')), + ('error_message', models.TextField(blank=True, null=True, verbose_name='错误信息')), + ('result_details', models.JSONField(blank=True, default=dict, verbose_name='结果详情')), + ('created_by', models.CharField(blank=True, max_length=100, null=True, verbose_name='创建者')), + ], + options={ + 'verbose_name': '爬取任务', + 'verbose_name_plural': '爬取任务', + 'ordering': ['-created_at'], + }, + ), + ] diff --git a/core/migrations/0003_remove_crawltask_websites_crawltask_websites.py b/core/migrations/0003_remove_crawltask_websites_crawltask_websites.py new file mode 100644 index 0000000..5d05397 --- /dev/null +++ b/core/migrations/0003_remove_crawltask_websites_crawltask_websites.py @@ -0,0 +1,22 @@ +# Generated by Django 5.1 on 2025-09-23 19:34 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0002_crawltask'), + ] + + operations = [ + migrations.RemoveField( + model_name='crawltask', + name='websites', + ), + migrations.AddField( + model_name='crawltask', + name='websites', + field=models.ManyToManyField(blank=True, to='core.website', verbose_name='目标网站'), + ), + ] diff --git a/core/models.py b/core/models.py index a176335..8b037dc 100644 --- a/core/models.py +++ b/core/models.py @@ -1,4 +1,6 @@ from django.db import models +from django.utils import timezone +import json class Website(models.Model): @@ -25,3 +27,93 @@ class Article(models.Model): def __str__(self): return self.title + + +class 
CrawlTask(models.Model): + """爬取任务模型""" + TASK_STATUS_CHOICES = [ + ('pending', '等待中'), + ('running', '运行中'), + ('completed', '已完成'), + ('failed', '失败'), + ('cancelled', '已取消'), + ] + + TASK_TYPE_CHOICES = [ + ('keyword', '关键词搜索'), + ('historical', '历史文章'), + ('full_site', '全站爬取'), + ] + + name = models.CharField(max_length=200, verbose_name="任务名称") + task_type = models.CharField(max_length=20, choices=TASK_TYPE_CHOICES, default='keyword', verbose_name="任务类型") + keyword = models.CharField(max_length=200, blank=True, null=True, verbose_name="搜索关键词") + websites = models.ManyToManyField(Website, blank=True, verbose_name="目标网站") + start_date = models.DateField(blank=True, null=True, verbose_name="开始日期") + end_date = models.DateField(blank=True, null=True, verbose_name="结束日期") + max_pages = models.IntegerField(default=10, verbose_name="最大页数") + max_articles = models.IntegerField(default=100, verbose_name="最大文章数") + + status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name="状态") + progress = models.IntegerField(default=0, verbose_name="进度百分比") + current_website = models.CharField(max_length=100, blank=True, null=True, verbose_name="当前网站") + current_action = models.CharField(max_length=200, blank=True, null=True, verbose_name="当前操作") + + total_articles = models.IntegerField(default=0, verbose_name="总文章数") + success_count = models.IntegerField(default=0, verbose_name="成功数") + failed_count = models.IntegerField(default=0, verbose_name="失败数") + + created_at = models.DateTimeField(auto_now_add=True, verbose_name="创建时间") + started_at = models.DateTimeField(blank=True, null=True, verbose_name="开始时间") + completed_at = models.DateTimeField(blank=True, null=True, verbose_name="完成时间") + + error_message = models.TextField(blank=True, null=True, verbose_name="错误信息") + result_details = models.JSONField(default=dict, blank=True, verbose_name="结果详情") + + created_by = models.CharField(max_length=100, blank=True, null=True, verbose_name="创建者") + + class Meta: + verbose_name = "爬取任务" + verbose_name_plural = "爬取任务" + ordering = ['-created_at'] + + def __str__(self): + return f"{self.name} ({self.get_status_display()})" + + def get_websites_display(self): + """获取网站列表的显示文本""" + websites = self.websites.all() + if not websites: + return "所有网站" + return ", ".join([w.name for w in websites]) + + def get_duration(self): + """获取任务执行时长""" + if not self.started_at: + return None + end_time = self.completed_at or timezone.now() + return end_time - self.started_at + + def is_running(self): + """判断任务是否正在运行""" + return self.status == 'running' + + def can_cancel(self): + """判断任务是否可以取消""" + return self.status in ['pending', 'running'] + + def get_progress_display(self): + """获取进度显示文本""" + if self.status == 'pending': + return "等待开始" + elif self.status == 'running': + if self.current_website and self.current_action: + return f"正在处理 {self.current_website}: {self.current_action}" + return f"运行中 ({self.progress}%)" + elif self.status == 'completed': + return f"已完成 ({self.success_count}/{self.total_articles})" + elif self.status == 'failed': + return f"失败: {self.error_message[:50]}..." 
if self.error_message else "失败" + elif self.status == 'cancelled': + return "已取消" + return "未知状态" \ No newline at end of file diff --git a/core/static/admin/js/crawl_task_actions.js b/core/static/admin/js/crawl_task_actions.js new file mode 100644 index 0000000..bf6b676 --- /dev/null +++ b/core/static/admin/js/crawl_task_actions.js @@ -0,0 +1,84 @@ +/** + * 爬取任务操作JavaScript + */ + +function startTask(taskId) { + if (confirm('确定要启动这个任务吗?')) { + fetch(`/admin/core/crawltask/${taskId}/start/`, { + method: 'POST', + headers: { + 'X-CSRFToken': getCookie('csrftoken'), + 'Content-Type': 'application/x-www-form-urlencoded', + }, + }) + .then(response => { + if (response.ok) { + location.reload(); + } else { + alert('启动任务失败'); + } + }) + .catch(error => { + console.error('Error:', error); + alert('启动任务失败'); + }); + } +} + +function cancelTask(taskId) { + if (confirm('确定要取消这个任务吗?')) { + fetch(`/admin/core/crawltask/${taskId}/cancel/`, { + method: 'POST', + headers: { + 'X-CSRFToken': getCookie('csrftoken'), + 'Content-Type': 'application/x-www-form-urlencoded', + }, + }) + .then(response => { + if (response.ok) { + location.reload(); + } else { + alert('取消任务失败'); + } + }) + .catch(error => { + console.error('Error:', error); + alert('取消任务失败'); + }); + } +} + +function viewResults(taskId) { + window.open(`/admin/core/crawltask/${taskId}/results/`, '_blank'); +} + +function getCookie(name) { + let cookieValue = null; + if (document.cookie && document.cookie !== '') { + const cookies = document.cookie.split(';'); + for (let i = 0; i < cookies.length; i++) { + const cookie = cookies[i].trim(); + if (cookie.substring(0, name.length + 1) === (name + '=')) { + cookieValue = decodeURIComponent(cookie.substring(name.length + 1)); + break; + } + } + } + return cookieValue; +} + +// 自动刷新运行中的任务状态 +function autoRefreshRunningTasks() { + const runningTasks = document.querySelectorAll('[data-task-status="running"]'); + if (runningTasks.length > 0) { + // 每30秒刷新一次页面 + setTimeout(() => { + location.reload(); + }, 30000); + } +} + +// 页面加载完成后执行 +document.addEventListener('DOMContentLoaded', function() { + autoRefreshRunningTasks(); +}); diff --git a/core/task_executor.py b/core/task_executor.py new file mode 100644 index 0000000..915c3b7 --- /dev/null +++ b/core/task_executor.py @@ -0,0 +1,235 @@ +""" +爬取任务执行器 +负责执行爬取任务并更新任务状态 +""" + +import threading +import time +from django.utils import timezone +from django.db import transaction +from core.models import CrawlTask +from core.utils import crawl_by_keyword, crawl_historical_articles, full_site_crawler, WEBSITE_SEARCH_CONFIGS + + +class TaskExecutor: + """任务执行器""" + + def __init__(self): + self.running_tasks = {} + self.lock = threading.Lock() + + def start_task(self, task_id): + """启动任务""" + with self.lock: + if task_id in self.running_tasks: + return False, "任务已在运行中" + + try: + task = CrawlTask.objects.get(id=task_id) + if task.status != 'pending': + return False, "任务状态不允许启动" + + # 更新任务状态 + task.status = 'running' + task.started_at = timezone.now() + task.save() + + # 启动后台线程执行任务 + thread = threading.Thread(target=self._execute_task, args=(task_id,)) + thread.daemon = True + thread.start() + + self.running_tasks[task_id] = thread + return True, "任务已启动" + + except CrawlTask.DoesNotExist: + return False, "任务不存在" + except Exception as e: + return False, f"启动任务失败: {e}" + + def cancel_task(self, task_id): + """取消任务""" + with self.lock: + if task_id in self.running_tasks: + # 标记任务为取消状态 + try: + task = CrawlTask.objects.get(id=task_id) + task.status = 'cancelled' + 
task.completed_at = timezone.now() + task.save() + + # 移除运行中的任务 + del self.running_tasks[task_id] + return True, "任务已取消" + except CrawlTask.DoesNotExist: + return False, "任务不存在" + else: + return False, "任务未在运行中" + + def _execute_task(self, task_id): + """执行任务的核心逻辑""" + try: + task = CrawlTask.objects.get(id=task_id) + + # 根据任务类型执行不同的爬取逻辑 + if task.task_type == 'keyword': + self._execute_keyword_task(task) + elif task.task_type == 'historical': + self._execute_historical_task(task) + elif task.task_type == 'full_site': + self._execute_full_site_task(task) + else: + raise ValueError(f"不支持的任务类型: {task.task_type}") + + # 任务完成 + with transaction.atomic(): + task = CrawlTask.objects.select_for_update().get(id=task_id) + task.status = 'completed' + task.completed_at = timezone.now() + task.progress = 100 + task.save() + + except Exception as e: + # 任务失败 + try: + with transaction.atomic(): + task = CrawlTask.objects.select_for_update().get(id=task_id) + task.status = 'failed' + task.completed_at = timezone.now() + task.error_message = str(e) + task.save() + except: + pass + + finally: + # 清理运行中的任务记录 + with self.lock: + if task_id in self.running_tasks: + del self.running_tasks[task_id] + + def _execute_keyword_task(self, task): + """执行关键词搜索任务""" + # 更新当前操作 + task.current_action = "开始关键词搜索" + task.save() + + # 准备参数 + websites = task.websites if task.websites else list(WEBSITE_SEARCH_CONFIGS.keys()) + start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None + end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None + + # 执行爬取 + results = crawl_by_keyword( + keyword=task.keyword, + website_names=websites, + max_pages=task.max_pages, + start_date=start_date, + end_date=end_date, + max_articles=task.max_articles + ) + + # 更新结果 + task.total_articles = results['total_articles'] + task.success_count = results['success_count'] + task.failed_count = results['failed_count'] + task.result_details = results['website_results'] + task.save() + + def _execute_historical_task(self, task): + """执行历史文章任务""" + # 更新当前操作 + task.current_action = "开始历史文章爬取" + task.save() + + # 准备参数 + websites = task.websites if task.websites else list(WEBSITE_SEARCH_CONFIGS.keys()) + start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None + end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None + + # 执行爬取 + results = crawl_historical_articles( + website_names=websites, + start_date=start_date, + end_date=end_date, + max_articles_per_site=task.max_articles + ) + + # 更新结果 + task.total_articles = results['total_articles'] + task.success_count = results['success_count'] + task.failed_count = results['failed_count'] + task.result_details = results['website_results'] + task.save() + + def _execute_full_site_task(self, task): + """执行全站爬取任务""" + # 更新当前操作 + task.current_action = "开始全站爬取" + task.save() + + # 准备参数 + websites = task.websites if task.websites else list(WEBSITE_SEARCH_CONFIGS.keys()) + + total_websites = len(websites) + completed_websites = 0 + + for website_name in websites: + try: + # 更新当前网站 + task.current_website = website_name + task.current_action = f"正在爬取 {website_name}" + task.save() + + # 获取或创建网站对象 + from core.models import Website + website, created = Website.objects.get_or_create( + name=website_name, + defaults={ + 'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"], + 'enabled': True + } + ) + + # 执行全站爬取 + full_site_crawler( + WEBSITE_SEARCH_CONFIGS[website_name]["search_url"], + website, + max_pages=task.max_pages + ) + + 
completed_websites += 1 + progress = int((completed_websites / total_websites) * 100) + task.progress = progress + task.save() + + except Exception as e: + # 记录错误但继续处理其他网站 + print(f"爬取网站 {website_name} 时出错: {e}") + continue + + # 更新最终结果 + task.total_articles = completed_websites # 这里可以改为实际爬取的文章数 + task.success_count = completed_websites + task.failed_count = total_websites - completed_websites + task.save() + + def get_task_status(self, task_id): + """获取任务状态""" + try: + task = CrawlTask.objects.get(id=task_id) + return { + 'status': task.status, + 'progress': task.progress, + 'current_website': task.current_website, + 'current_action': task.current_action, + 'total_articles': task.total_articles, + 'success_count': task.success_count, + 'failed_count': task.failed_count, + 'error_message': task.error_message + } + except CrawlTask.DoesNotExist: + return None + + +# 全局任务执行器实例 +task_executor = TaskExecutor() diff --git a/core/templates/admin/create_full_site_task.html b/core/templates/admin/create_full_site_task.html new file mode 100644 index 0000000..81cde17 --- /dev/null +++ b/core/templates/admin/create_full_site_task.html @@ -0,0 +1,139 @@ +{% extends "admin/base_site.html" %} +{% load i18n admin_urls static admin_modify %} + +{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %} + +{% block breadcrumbs %} + +{% endblock %} + +{% block content %} +

+<h1>{{ title }}</h1>
+
+<p class="help" style="color: #c0392b;">
+    注意:全站爬取会爬取整个网站的所有文章,可能需要很长时间。建议在非高峰时段进行。
+</p>
+
+<form method="post">
+    {% csrf_token %}
+
+    <fieldset class="module aligned">
+        <h2>基本信息</h2>
+        <div class="form-row">
+            <label for="id_name">任务名称:</label>
+            <input type="text" id="id_name" name="name" maxlength="200" required>
+            <p class="help">为这个全站爬取任务起一个容易识别的名称</p>
+        </div>
+    </fieldset>
+
+    <fieldset class="module aligned">
+        <h2>目标网站</h2>
+        <div class="form-row">
+            {% for website in websites %}
+            <label><input type="checkbox" name="websites" value="{{ website.name }}"> {{ website.name }}</label>
+            {% endfor %}
+            <p class="help">不选择任何网站将爬取所有支持的网站</p>
+        </div>
+    </fieldset>
+
+    <fieldset class="module aligned">
+        <h2>爬取设置</h2>
+        <div class="form-row">
+            <label for="id_max_pages">最大页数:</label>
+            <input type="number" id="id_max_pages" name="max_pages" value="500" min="1" max="5000">
+            <p class="help">每个网站最多爬取的页数 (1-5000)</p>
+        </div>
+    </fieldset>
+
+    <div class="submit-row">
+        <input type="submit" value="创建任务" class="default">
+        <a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
+    </div>
+</form>
+ + + + +{% endblock %} \ No newline at end of file diff --git a/core/templates/admin/create_historical_task.html b/core/templates/admin/create_historical_task.html new file mode 100644 index 0000000..4e8920c --- /dev/null +++ b/core/templates/admin/create_historical_task.html @@ -0,0 +1,164 @@ +{% extends "admin/base_site.html" %} +{% load i18n admin_urls static admin_modify %} + +{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %} + +{% block breadcrumbs %} + +{% endblock %} + +{% block content %} +

+<h1>{{ title }}</h1>
+
+<form method="post">
+    {% csrf_token %}
+
+    <fieldset class="module aligned">
+        <h2>基本信息</h2>
+        <div class="form-row">
+            <label for="id_name">任务名称:</label>
+            <input type="text" id="id_name" name="name" maxlength="200" required>
+            <p class="help">为这个历史文章爬取任务起一个容易识别的名称</p>
+        </div>
+    </fieldset>
+
+    <fieldset class="module aligned">
+        <h2>目标网站</h2>
+        <div class="form-row">
+            {% for website in websites %}
+            <label><input type="checkbox" name="websites" value="{{ website.name }}"> {{ website.name }}</label>
+            {% endfor %}
+            <p class="help">不选择任何网站将爬取所有支持的网站</p>
+        </div>
+    </fieldset>
+
+    <fieldset class="module aligned">
+        <h2>时间范围</h2>
+        <div class="form-row">
+            <label for="id_start_date">开始日期:</label>
+            <input type="date" id="id_start_date" name="start_date">
+            <p class="help">历史文章的开始日期</p>
+        </div>
+        <div class="form-row">
+            <label for="id_end_date">结束日期:</label>
+            <input type="date" id="id_end_date" name="end_date">
+            <p class="help">历史文章的结束日期</p>
+        </div>
+    </fieldset>
+
+    <fieldset class="module aligned">
+        <h2>爬取设置</h2>
+        <div class="form-row">
+            <label for="id_max_articles">最大文章数:</label>
+            <input type="number" id="id_max_articles" name="max_articles" value="50" min="1" max="500">
+            <p class="help">每个网站最多爬取的文章数量 (1-500)</p>
+        </div>
+    </fieldset>
+
+    <div class="submit-row">
+        <input type="submit" value="创建任务" class="default">
+        <a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
+    </div>
+</form>
+ + + + +{% endblock %} \ No newline at end of file diff --git a/core/templates/admin/create_keyword_task.html b/core/templates/admin/create_keyword_task.html new file mode 100644 index 0000000..27bdaea --- /dev/null +++ b/core/templates/admin/create_keyword_task.html @@ -0,0 +1,180 @@ +{% extends "admin/base_site.html" %} +{% load i18n admin_urls static admin_modify %} + +{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %} + +{% block breadcrumbs %} + +{% endblock %} + +{% block content %} +

+<h1>{{ title }}</h1>
+
+<form method="post">
+    {% csrf_token %}
+
+    <fieldset class="module aligned">
+        <h2>基本信息</h2>
+        <div class="form-row">
+            <label for="id_name">任务名称:</label>
+            <input type="text" id="id_name" name="name" maxlength="200" required>
+            <p class="help">为这个爬取任务起一个容易识别的名称</p>
+        </div>
+        <div class="form-row">
+            <label for="id_keyword">搜索关键词:</label>
+            <input type="text" id="id_keyword" name="keyword" maxlength="200" required>
+            <p class="help">输入要搜索的关键词,例如:人工智能、两会、政策等</p>
+        </div>
+    </fieldset>
+
+    <fieldset class="module aligned">
+        <h2>目标网站</h2>
+        <div class="form-row">
+            {% for website in websites %}
+            <label><input type="checkbox" name="websites" value="{{ website.name }}"> {{ website.name }}</label>
+            {% endfor %}
+            <p class="help">不选择任何网站将爬取所有支持的网站</p>
+        </div>
+    </fieldset>
+
+    <fieldset class="module aligned">
+        <h2>时间范围 (可选)</h2>
+        <div class="form-row">
+            <label for="id_start_date">开始日期:</label>
+            <input type="date" id="id_start_date" name="start_date">
+            <p class="help">留空则搜索所有时间</p>
+        </div>
+        <div class="form-row">
+            <label for="id_end_date">结束日期:</label>
+            <input type="date" id="id_end_date" name="end_date">
+            <p class="help">留空则搜索到当前时间</p>
+        </div>
+    </fieldset>
+
+    <fieldset class="module aligned">
+        <h2>爬取设置</h2>
+        <div class="form-row">
+            <label for="id_max_pages">最大页数:</label>
+            <input type="number" id="id_max_pages" name="max_pages" value="10" min="1" max="100">
+            <p class="help">每个网站最多搜索的页数 (1-100)</p>
+        </div>
+        <div class="form-row">
+            <label for="id_max_articles">最大文章数:</label>
+            <input type="number" id="id_max_articles" name="max_articles" value="100" min="1" max="1000">
+            <p class="help">总共最多爬取的文章数量 (1-1000)</p>
+        </div>
+    </fieldset>
+
+    <div class="submit-row">
+        <input type="submit" value="创建任务" class="default">
+        <a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
+    </div>
+</form>
+ + + + +{% endblock %} \ No newline at end of file diff --git a/core/templates/admin/index.html b/core/templates/admin/index.html new file mode 100644 index 0000000..1b5fa38 --- /dev/null +++ b/core/templates/admin/index.html @@ -0,0 +1,172 @@ +{% extends "admin/base_site.html" %} +{% load i18n static %} + +{% block extrastyle %}{{ block.super }}{% endblock %} + +{% block coltype %}colMS{% endblock %} + +{% block bodyclass %}{{ block.super }} dashboard{% endblock %} + +{% block breadcrumbs %}{% endblock %} + +{% block nav-sidebar %}{% endblock %} + +{% block content %} +
+<div id="content-main">
+
+{% if app_list %}
+    {% for app in app_list %}
+    <div class="app-{{ app.app_label }} module">
+        <table>
+            <caption><a href="{{ app.app_url }}" class="section">{{ app.name }}</a></caption>
+            {% for model in app.models %}
+            <tr class="model-{{ model.object_name|lower }}">
+                {% if model.admin_url %}
+                <th scope="row"><a href="{{ model.admin_url }}">{{ model.name }}</a></th>
+                {% else %}
+                <th scope="row">{{ model.name }}</th>
+                {% endif %}
+                {% if model.add_url %}
+                <td><a href="{{ model.add_url }}" class="addlink">{% translate 'Add' %}</a></td>
+                {% else %}
+                <td></td>
+                {% endif %}
+                {% if model.admin_url %}
+                    {% if model.view_only %}
+                    <td><a href="{{ model.admin_url }}" class="viewlink">{% translate 'View' %}</a></td>
+                    {% else %}
+                    <td><a href="{{ model.admin_url }}" class="changelink">{% translate 'Change' %}</a></td>
+                    {% endif %}
+                {% else %}
+                <td></td>
+                {% endif %}
+            </tr>
+            {% endfor %}
+        </table>
+    </div>
+    {% endfor %}
+{% else %}
+    <p>{% translate "You don't have permission to view or edit anything." %}</p>
+{% endif %}
+
+<div class="module">
+    <h2>快速创建爬取任务</h2>
+    <div style="display: flex; gap: 20px; padding: 10px;">
+        <div>
+            <h3>关键词搜索</h3>
+            <p>根据关键词搜索并爬取相关文章</p>
+            <a href="{% url 'admin:create_keyword_task' %}" class="button">创建任务</a>
+        </div>
+        <div>
+            <h3>历史文章</h3>
+            <p>爬取指定日期范围的历史文章</p>
+            <a href="{% url 'admin:create_historical_task' %}" class="button">创建任务</a>
+        </div>
+        <div>
+            <h3>全站爬取</h3>
+            <p>爬取整个网站的所有文章</p>
+            <a href="{% url 'admin:create_full_site_task' %}" class="button">创建任务</a>
+        </div>
+    </div>
+</div>
+
+<div class="module">
+    <h2>最近任务状态</h2>
+    {% load core_extras %}
+    {% get_recent_tasks as recent_tasks %}
+    {% if recent_tasks %}
+    <table>
+        <thead>
+            <tr><th>任务名称</th><th>类型</th><th>状态</th><th>进度</th><th>创建时间</th><th>操作</th></tr>
+        </thead>
+        <tbody>
+            {% for task in recent_tasks %}
+            <tr data-task-status="{{ task.status }}">
+                <td>{{ task.name }}</td>
+                <td>{{ task.get_task_type_display }}</td>
+                <td><span style="color: {{ task.status|task_status_color }};">{{ task.get_status_display }}</span></td>
+                <td>
+                    {% if task.status == 'running' %}
+                    <div style="width: 80px; background: #f0f0f0; border-radius: 3px;">
+                        <div style="width: {{ task.progress }}%; background: #79aec8; text-align: center; border-radius: 3px;">{{ task.progress }}%</div>
+                    </div>
+                    {% else %}
+                    -
+                    {% endif %}
+                </td>
+                <td>{{ task.created_at|date:"m-d H:i" }}</td>
+                <td><a href="{% url 'admin:core_crawltask_change' task.id %}">查看</a></td>
+            </tr>
+            {% endfor %}
+        </tbody>
+    </table>
+    {% else %}
+    <p>暂无任务</p>
+    {% endif %}
+</div>
+
+</div>
+{% endblock %} + +{% block sidebar %} + +{% endblock %} diff --git a/core/templates/admin/task_results.html b/core/templates/admin/task_results.html new file mode 100644 index 0000000..6276c4d --- /dev/null +++ b/core/templates/admin/task_results.html @@ -0,0 +1,184 @@ +{% extends "admin/base_site.html" %} +{% load i18n admin_urls static admin_modify %} + +{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %} + +{% block breadcrumbs %} + +{% endblock %} + +{% block content %} +

+<h1>{{ title }}</h1>
+
+<div class="module">
+    <h2>任务概览</h2>
+    <table>
+        <tr><th>任务名称:</th><td>{{ task.name }}</td></tr>
+        <tr><th>任务类型:</th><td>{{ task.get_task_type_display }}</td></tr>
+        <tr><th>状态:</th><td><strong>{{ task.get_status_display }}</strong></td></tr>
+        <tr><th>创建时间:</th><td>{{ task.created_at|date:"Y-m-d H:i:s" }}</td></tr>
+        {% if task.started_at %}
+        <tr><th>开始时间:</th><td>{{ task.started_at|date:"Y-m-d H:i:s" }}</td></tr>
+        {% endif %}
+        {% if task.completed_at %}
+        <tr><th>完成时间:</th><td>{{ task.completed_at|date:"Y-m-d H:i:s" }}</td></tr>
+        {% endif %}
+        {% if task.get_duration %}
+        <tr><th>执行时长:</th><td>{{ task.get_duration }}</td></tr>
+        {% endif %}
+    </table>
+</div>
+
+<div class="module">
+    <h2>统计信息</h2>
+    <table>
+        <tr><th>总文章数</th><td>{{ task.total_articles }}</td></tr>
+        <tr><th>成功数</th><td>{{ task.success_count }}</td></tr>
+        <tr><th>失败数</th><td>{{ task.failed_count }}</td></tr>
+        {% if task.total_articles > 0 %}
+        <tr><th>成功率</th><td>{% widthratio task.success_count task.total_articles 100 %}%</td></tr>
+        {% endif %}
+    </table>
+</div>
+
+{% if task.keyword %}
+<div class="module">
+    <h2>任务配置</h2>
+    <table>
+        <tr><th>搜索关键词:</th><td>{{ task.keyword }}</td></tr>
+        <tr><th>目标网站:</th><td>{{ task.get_websites_display }}</td></tr>
+        {% if task.start_date %}<tr><th>开始日期:</th><td>{{ task.start_date }}</td></tr>{% endif %}
+        {% if task.end_date %}<tr><th>结束日期:</th><td>{{ task.end_date }}</td></tr>{% endif %}
+        <tr><th>最大页数:</th><td>{{ task.max_pages }}</td></tr>
+        <tr><th>最大文章数:</th><td>{{ task.max_articles }}</td></tr>
+    </table>
+</div>
+{% endif %}
+
+{% if task.current_website or task.current_action %}
+<div class="module">
+    <h2>当前状态</h2>
+    {% if task.current_website %}<p>当前网站: {{ task.current_website }}</p>{% endif %}
+    {% if task.current_action %}<p>当前操作: {{ task.current_action }}</p>{% endif %}
+    {% if task.status == 'running' %}
+    <div style="width: 200px; background: #f0f0f0; border-radius: 3px;">
+        <div style="width: {{ task.progress }}%; background: #79aec8; color: #fff; text-align: center; border-radius: 3px;">{{ task.progress }}%</div>
+    </div>
+    {% endif %}
+</div>
+{% endif %}
+
+{% if task.error_message %}
+<div class="module">
+    <h2>错误信息</h2>
+    <pre>{{ task.error_message }}</pre>
+</div>
+{% endif %}
+
+{% if task.result_details %}
+<div class="module">
+    <h2>详细结果</h2>
+    {% for website, result in task.result_details.items %}
+    <div style="margin-bottom: 10px;">
+        <strong>{{ website }}:</strong>
+        <ul>
+            <li>找到链接: {{ result.found_urls }}</li>
+            <li>已处理: {{ result.processed }}</li>
+            <li>成功: {{ result.success }}</li>
+            <li>失败: {{ result.failed }}</li>
+            {% if result.error %}<li>错误: {{ result.error }}</li>{% endif %}
+        </ul>
+    </div>
+    {% endfor %}
+</div>
+{% endif %}
+
+<div class="submit-row">
+    <a href="{% url 'admin:core_crawltask_changelist' %}" class="button">返回任务列表</a>
+    {% if task.status == 'completed' %}
+    <a href="{% url 'admin:core_article_changelist' %}" class="button">查看文章</a>
+    {% endif %}
+</div>
+ +{% endblock %} diff --git a/core/templatetags/__init__.py b/core/templatetags/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/core/templatetags/core_extras.py b/core/templatetags/core_extras.py new file mode 100644 index 0000000..05b3996 --- /dev/null +++ b/core/templatetags/core_extras.py @@ -0,0 +1,46 @@ +from django import template +from django.core.cache import cache +from core.models import CrawlTask + +register = template.Library() + + +@register.simple_tag +def get_recent_tasks(limit=5): + """获取最近的任务""" + cache_key = f'recent_tasks_{limit}' + recent_tasks = cache.get(cache_key) + + if recent_tasks is None: + recent_tasks = CrawlTask.objects.all()[:limit] + cache.set(cache_key, recent_tasks, 60) # 缓存1分钟 + + return recent_tasks + + +@register.filter +def task_status_color(status): + """根据任务状态返回颜色""" + color_map = { + 'pending': 'gray', + 'running': 'blue', + 'completed': 'green', + 'failed': 'red', + 'cancelled': 'orange', + } + return color_map.get(status, 'gray') + + +@register.filter +def task_progress_bar(progress): + """生成进度条HTML""" + if progress is None: + progress = 0 + + return f''' +
+    <div style="width: 100%; background: #f0f0f0; border-radius: 3px;">
+        <div style="width: {progress}%; background: #79aec8; color: #fff; text-align: center; border-radius: 3px;">
+            {progress}%
+        </div>
+    </div>
+ ''' diff --git a/core/utils.py b/core/utils.py index 1dbac76..09b488c 100644 --- a/core/utils.py +++ b/core/utils.py @@ -1,7 +1,7 @@ import os import requests from bs4 import BeautifulSoup -from urllib.parse import urljoin, urlparse +from urllib.parse import urljoin, urlparse, quote from collections import deque from django.utils import timezone from django.conf import settings @@ -15,6 +15,8 @@ from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from webdriver_manager.chrome import ChromeDriverManager +from datetime import datetime, timedelta +import json def get_selenium_driver(): @@ -2270,3 +2272,695 @@ def full_site_crawler(start_url, website, max_pages=1000): queue.append(href) elif href not in visited and is_valid_url(href, base_netloc): queue.append(href) + + +# 网站搜索配置 +WEBSITE_SEARCH_CONFIGS = { + "新华网": { + "search_url": "http://so.news.cn/getNews", + "search_params": { + "keyword": "{keyword}", + "curPage": "{page}", + "sortField": "0", + "sortType": "1" + }, + "method": "post", + "headers": { + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + } + }, + "人民日报": { + "search_url": "http://search.people.com.cn/search", + "search_params": { + "keyword": "{keyword}", + "st": "0", + "startDate": "{start_date}", + "endDate": "{end_date}", + "page": "{page}" + }, + "method": "get" + }, + "央视网": { + "search_url": "https://search.cctv.com/search.php", + "search_params": { + "qtext": "{keyword}", + "type": "web", + "page": "{page}" + }, + "method": "get" + }, + "光明日报": { + "search_url": "http://search.gmw.cn/search", + "search_params": { + "q": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "经济日报": { + "search_url": "http://www.ce.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "中国日报": { + "search_url": "http://www.chinadaily.com.cn/search", + "search_params": { + "q": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "工人日报": { + "search_url": "https://www.workercn.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "科技日报": { + "search_url": "http://www.stdaily.com/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "人民政协网": { + "search_url": "https://www.rmzxw.com.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "中国纪检监察报": { + "search_url": "http://www.jjjcb.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "中国新闻社": { + "search_url": "https://www.chinanews.com.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "学习时报": { + "search_url": "https://www.studytimes.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "中国青年报": { + "search_url": "http://news.cyol.com/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "中国妇女报": { + "search_url": "https://www.cnwomen.com.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "法治日报": { + "search_url": "http://www.legaldaily.com.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + 
"method": "get" + }, + "农民日报": { + "search_url": "https://www.farmer.com.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "学习强国": { + "search_url": "https://www.xuexi.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "旗帜网": { + "search_url": "http://www.qizhiwang.org.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "中国网": { + "search_url": "http://www.china.com.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "中国政府网": { + "search_url": "https://www.gov.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "求是网": { + "search_url": "http://www.qstheory.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + }, + "解放军报": { + "search_url": "http://www.81.cn/search", + "search_params": { + "keyword": "{keyword}", + "page": "{page}" + }, + "method": "get" + } +} + + +def search_articles_by_keyword(website_name, keyword, max_pages=10, start_date=None, end_date=None): + """ + 根据关键词搜索文章 + + Args: + website_name: 网站名称 + keyword: 搜索关键词 + max_pages: 最大搜索页数 + start_date: 开始日期 (YYYY-MM-DD) + end_date: 结束日期 (YYYY-MM-DD) + + Returns: + list: 搜索到的文章URL列表 + """ + if website_name not in WEBSITE_SEARCH_CONFIGS: + print(f"网站 {website_name} 不支持搜索功能") + return [] + + config = WEBSITE_SEARCH_CONFIGS[website_name] + article_urls = [] + + # 设置默认日期范围 + if not start_date: + start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d") + if not end_date: + end_date = datetime.now().strftime("%Y-%m-%d") + + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + } + headers.update(config.get("headers", {})) + + for page in range(1, max_pages + 1): + try: + # 构建搜索参数 + search_params = {} + for key, value in config["search_params"].items(): + search_params[key] = value.format( + keyword=quote(keyword), + page=page, + start_date=start_date, + end_date=end_date + ) + + print(f"搜索 {website_name} 第 {page} 页: {keyword}") + + if config["method"] == "post": + response = requests.post( + config["search_url"], + data=search_params, + headers=headers, + timeout=15 + ) + else: + response = requests.get( + config["search_url"], + params=search_params, + headers=headers, + timeout=15 + ) + + response.raise_for_status() + response.encoding = 'utf-8' + + # 解析搜索结果 + soup = BeautifulSoup(response.text, "html.parser") + page_urls = extract_search_results(soup, website_name) + + if not page_urls: + print(f"第 {page} 页没有找到更多结果") + break + + article_urls.extend(page_urls) + print(f"第 {page} 页找到 {len(page_urls)} 篇文章") + + # 避免请求过快 + time.sleep(1) + + except Exception as e: + print(f"搜索第 {page} 页时出错: {e}") + continue + + print(f"总共找到 {len(article_urls)} 篇文章") + return article_urls + + +def extract_search_results(soup, website_name): + """ + 从搜索结果页面提取文章链接 + + Args: + soup: BeautifulSoup对象 + website_name: 网站名称 + + Returns: + list: 文章URL列表 + """ + urls = [] + + # 根据不同网站的搜索结果结构提取链接 + if website_name == "新华网": + # 新华网搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/news/" in href or "/article/" in href: + urls.append(href) + + elif website_name == "人民日报": + # 人民日报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/n1/" in href or "/article/" in href: + 
urls.append(href) + + elif website_name == "央视网": + # 央视网搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/news/" in href or "ARTI" in href: + urls.append(href) + + elif website_name == "光明日报": + # 光明日报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "经济日报": + # 经济日报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "中国日报": + # 中国日报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "工人日报": + # 工人日报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/c/" in href or "/article/" in href: + urls.append(href) + + elif website_name == "科技日报": + # 科技日报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "人民政协网": + # 人民政协网搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "中国纪检监察报": + # 中国纪检监察报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "中国新闻社": + # 中国新闻社搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "学习时报": + # 学习时报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "中国青年报": + # 中国青年报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "中国妇女报": + # 中国妇女报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "法治日报": + # 法治日报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/content/" in href and "content_" in href: + urls.append(href) + + elif website_name == "农民日报": + # 农民日报搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "学习强国": + # 学习强国搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "旗帜网": + # 旗帜网搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/n1/" in href or "/article/" in href: + urls.append(href) + + elif website_name == "中国网": + # 中国网搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/opinion/" in href: + urls.append(href) + + elif website_name == "中国政府网": + # 中国政府网搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/zhengce/" in href or "/xinwen/" in href: + urls.append(href) + + elif website_name == "求是网": + # 求是网搜索结果结构 + for link in soup.find_all("a", href=True): + href = link["href"] + if "/article/" in href or "/content/" in href: + urls.append(href) + + elif website_name == "解放军报": + # 解放军报搜索结果结构 + 
for link in soup.find_all("a", href=True): + href = link["href"] + if "/zt/" in href or "/article/" in href: + urls.append(href) + + # 去重并返回 + return list(set(urls)) + + +def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None, end_date=None, max_articles=100): + """ + 根据关键词爬取多个网站的文章 + + Args: + keyword: 搜索关键词 + website_names: 网站名称列表,如果为None则爬取所有支持的网站 + max_pages: 每个网站最大搜索页数 + start_date: 开始日期 (YYYY-MM-DD) + end_date: 结束日期 (YYYY-MM-DD) + max_articles: 最大文章数量 + + Returns: + dict: 爬取结果统计 + """ + if website_names is None: + website_names = list(WEBSITE_SEARCH_CONFIGS.keys()) + + results = { + "keyword": keyword, + "total_articles": 0, + "success_count": 0, + "failed_count": 0, + "website_results": {} + } + + print(f"开始根据关键词 '{keyword}' 爬取文章...") + print(f"目标网站: {', '.join(website_names)}") + + for website_name in website_names: + print(f"\n开始爬取 {website_name}...") + + try: + # 获取或创建网站对象 + from core.models import Website + website, created = Website.objects.get_or_create( + name=website_name, + defaults={ + 'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"], + 'enabled': True + } + ) + + # 搜索文章URL + article_urls = search_articles_by_keyword( + website_name, keyword, max_pages, start_date, end_date + ) + + if not article_urls: + print(f"{website_name} 没有找到相关文章") + results["website_results"][website_name] = { + "found_urls": 0, + "processed": 0, + "success": 0, + "failed": 0 + } + continue + + # 限制文章数量 + if len(article_urls) > max_articles: + article_urls = article_urls[:max_articles] + + print(f"{website_name} 找到 {len(article_urls)} 篇文章,开始处理...") + + website_success = 0 + website_failed = 0 + + for i, url in enumerate(article_urls, 1): + try: + print(f"处理第 {i}/{len(article_urls)} 篇: {url}") + process_article(url, website) + website_success += 1 + results["success_count"] += 1 + + # 避免请求过快 + time.sleep(0.5) + + except Exception as e: + print(f"处理文章失败: {url}, 错误: {e}") + website_failed += 1 + results["failed_count"] += 1 + + results["website_results"][website_name] = { + "found_urls": len(article_urls), + "processed": len(article_urls), + "success": website_success, + "failed": website_failed + } + + print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}") + + except Exception as e: + print(f"爬取 {website_name} 时出错: {e}") + results["website_results"][website_name] = { + "found_urls": 0, + "processed": 0, + "success": 0, + "failed": 1, + "error": str(e) + } + results["failed_count"] += 1 + + results["total_articles"] = results["success_count"] + results["failed_count"] + + print(f"\n爬取完成!") + print(f"关键词: {keyword}") + print(f"总文章数: {results['total_articles']}") + print(f"成功: {results['success_count']}") + print(f"失败: {results['failed_count']}") + + return results + + +def crawl_historical_articles(website_names=None, start_date=None, end_date=None, max_articles_per_site=50): + """ + 爬取历史文章 + + Args: + website_names: 网站名称列表 + start_date: 开始日期 (YYYY-MM-DD) + end_date: 结束日期 (YYYY-MM-DD) + max_articles_per_site: 每个网站最大文章数 + + Returns: + dict: 爬取结果统计 + """ + if not start_date: + start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d") + if not end_date: + end_date = datetime.now().strftime("%Y-%m-%d") + + if website_names is None: + website_names = list(WEBSITE_SEARCH_CONFIGS.keys()) + + results = { + "start_date": start_date, + "end_date": end_date, + "total_articles": 0, + "success_count": 0, + "failed_count": 0, + "website_results": {} + } + + print(f"开始爬取历史文章...") + print(f"日期范围: {start_date} 到 {end_date}") + print(f"目标网站: {', 
'.join(website_names)}") + + # 使用通用关键词搜索历史文章 + common_keywords = ["新闻", "报道", "文章", "资讯", "动态"] + + for website_name in website_names: + print(f"\n开始爬取 {website_name} 历史文章...") + + try: + from core.models import Website + website, created = Website.objects.get_or_create( + name=website_name, + defaults={ + 'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"], + 'enabled': True + } + ) + + website_success = 0 + website_failed = 0 + all_urls = set() + + # 使用多个关键词搜索 + for keyword in common_keywords: + try: + article_urls = search_articles_by_keyword( + website_name, keyword, max_pages=5, + start_date=start_date, end_date=end_date + ) + all_urls.update(article_urls) + + if len(all_urls) >= max_articles_per_site: + break + + except Exception as e: + print(f"搜索关键词 '{keyword}' 时出错: {e}") + continue + + # 限制文章数量 + article_urls = list(all_urls)[:max_articles_per_site] + + if not article_urls: + print(f"{website_name} 没有找到历史文章") + results["website_results"][website_name] = { + "found_urls": 0, + "processed": 0, + "success": 0, + "failed": 0 + } + continue + + print(f"{website_name} 找到 {len(article_urls)} 篇历史文章,开始处理...") + + for i, url in enumerate(article_urls, 1): + try: + print(f"处理第 {i}/{len(article_urls)} 篇: {url}") + process_article(url, website) + website_success += 1 + results["success_count"] += 1 + + time.sleep(0.5) + + except Exception as e: + print(f"处理文章失败: {url}, 错误: {e}") + website_failed += 1 + results["failed_count"] += 1 + + results["website_results"][website_name] = { + "found_urls": len(article_urls), + "processed": len(article_urls), + "success": website_success, + "failed": website_failed + } + + print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}") + + except Exception as e: + print(f"爬取 {website_name} 历史文章时出错: {e}") + results["website_results"][website_name] = { + "found_urls": 0, + "processed": 0, + "success": 0, + "failed": 1, + "error": str(e) + } + results["failed_count"] += 1 + + results["total_articles"] = results["success_count"] + results["failed_count"] + + print(f"\n历史文章爬取完成!") + print(f"日期范围: {start_date} 到 {end_date}") + print(f"总文章数: {results['total_articles']}") + print(f"成功: {results['success_count']}") + print(f"失败: {results['failed_count']}") + + return results