Support keyword

2025-09-24 03:38:32 +08:00
parent a4891b1c30
commit 8592833d74
16 changed files with 2888 additions and 2 deletions

View File

@@ -16,9 +16,10 @@ from django.utils import timezone
from django.db.models import Count, Q
from django.core.cache import cache
from .models import Website, Article, CrawlTask
from .tasks import crawl_website, crawl_all_websites, cleanup_old_articles
from .distributed_crawler import distributed_crawler
from .task_executor import task_executor
logger = logging.getLogger(__name__)
@@ -411,6 +412,419 @@ class ArticleAdmin(admin.ModelAdmin):
actions_column.short_description = '操作'
class CrawlTaskStatusFilter(SimpleListFilter):
"""爬取任务状态过滤器"""
title = '任务状态'
parameter_name = 'status'
def lookups(self, request, model_admin):
return (
('pending', '等待中'),
('running', '运行中'),
('completed', '已完成'),
('failed', '失败'),
('cancelled', '已取消'),
)
def queryset(self, request, queryset):
if self.value():
return queryset.filter(status=self.value())
return queryset
class CrawlTaskTypeFilter(SimpleListFilter):
"""爬取任务类型过滤器"""
title = '任务类型'
parameter_name = 'task_type'
def lookups(self, request, model_admin):
return (
('keyword', '关键词搜索'),
('historical', '历史文章'),
('full_site', '全站爬取'),
)
def queryset(self, request, queryset):
if self.value():
return queryset.filter(task_type=self.value())
return queryset
class CrawlTaskAdmin(admin.ModelAdmin):
"""爬取任务管理"""
list_display = [
'name', 'task_type', 'keyword', 'websites_display', 'status',
'progress_display', 'created_at', 'duration_display', 'actions_column'
]
list_filter = [CrawlTaskStatusFilter, CrawlTaskTypeFilter, 'created_at']
search_fields = ['name', 'keyword', 'created_by']
readonly_fields = [
'status', 'progress', 'current_website', 'current_action',
'total_articles', 'success_count', 'failed_count',
'created_at', 'started_at', 'completed_at', 'error_message',
'result_details', 'duration_display', 'progress_display'
]
actions = ['start_tasks', 'cancel_tasks', 'delete_completed_tasks']
class Media:
js = ('admin/js/crawl_task_actions.js',)
fieldsets = (
('基本信息', {
'fields': ('name', 'task_type', 'keyword')
}),
('爬取配置', {
'fields': ('websites', 'start_date', 'end_date', 'max_pages', 'max_articles')
}),
('任务状态', {
'fields': ('status', 'progress_display', 'current_website', 'current_action'),
'classes': ('collapse',)
}),
('统计信息', {
'fields': ('total_articles', 'success_count', 'failed_count'),
'classes': ('collapse',)
}),
('时间信息', {
'fields': ('created_at', 'started_at', 'completed_at', 'duration_display'),
'classes': ('collapse',)
}),
('错误信息', {
'fields': ('error_message',),
'classes': ('collapse',)
}),
('结果详情', {
'fields': ('result_details',),
'classes': ('collapse',)
}),
)
def websites_display(self, obj):
"""网站列表显示"""
return obj.get_websites_display()
websites_display.short_description = '目标网站'
def progress_display(self, obj):
"""进度显示"""
if obj.status == 'running':
return format_html(
'<div style="width: 100px; background-color: #f0f0f0; border-radius: 3px;">'
'<div style="width: {}%; background-color: #4CAF50; height: 20px; border-radius: 3px; text-align: center; color: white; line-height: 20px;">{}%</div>'
'</div>',
obj.progress, obj.progress
)
elif obj.status == 'completed':
return format_html('<span style="color: green;">✓ 完成</span>')
elif obj.status == 'failed':
return format_html('<span style="color: red;">✗ 失败</span>')
elif obj.status == 'cancelled':
return format_html('<span style="color: orange;">⊘ 已取消</span>')
else:
return format_html('<span style="color: gray;">⏳ 等待</span>')
progress_display.short_description = '进度'
def duration_display(self, obj):
"""执行时长显示"""
duration = obj.get_duration()
if duration:
total_seconds = int(duration.total_seconds())
hours = total_seconds // 3600
minutes = (total_seconds % 3600) // 60
seconds = total_seconds % 60
if hours > 0:
return f"{hours}小时{minutes}分钟"
elif minutes > 0:
return f"{minutes}分钟{seconds}"
else:
return f"{seconds}"
return "-"
duration_display.short_description = '执行时长'
def actions_column(self, obj):
"""操作列"""
actions = []
if obj.status == 'pending':
actions.append(f'<a href="javascript:void(0)" onclick="startTask({obj.id})" class="button">开始</a>')
if obj.can_cancel():
actions.append(f'<a href="javascript:void(0)" onclick="cancelTask({obj.id})" class="button">取消</a>')
if obj.status == 'completed':
actions.append(f'<a href="javascript:void(0)" onclick="viewResults({obj.id})" class="button">查看结果</a>')
return format_html(' '.join(actions))
actions_column.short_description = '操作'
def start_tasks(self, request, queryset):
"""启动选中的任务"""
started_count = 0
for task in queryset.filter(status='pending'):
try:
success, message = task_executor.start_task(task.id)
if success:
started_count += 1
else:
self.message_user(request, f'启动任务 {task.name} 失败: {message}', messages.ERROR)
except Exception as e:
self.message_user(request, f'启动任务 {task.name} 失败: {e}', messages.ERROR)
if started_count > 0:
self.message_user(request, f'成功启动 {started_count} 个任务', messages.SUCCESS)
start_tasks.short_description = '启动选中的任务'
def cancel_tasks(self, request, queryset):
"""取消选中的任务"""
cancelled_count = 0
for task in queryset.filter(status__in=['pending', 'running']):
try:
success, message = task_executor.cancel_task(task.id)
if success:
cancelled_count += 1
else:
self.message_user(request, f'取消任务 {task.name} 失败: {message}', messages.ERROR)
except Exception as e:
self.message_user(request, f'取消任务 {task.name} 失败: {e}', messages.ERROR)
if cancelled_count > 0:
self.message_user(request, f'成功取消 {cancelled_count} 个任务', messages.SUCCESS)
cancel_tasks.short_description = '取消选中的任务'
def delete_completed_tasks(self, request, queryset):
"""删除已完成的任务"""
completed_tasks = queryset.filter(status__in=['completed', 'failed', 'cancelled'])
count = completed_tasks.count()
completed_tasks.delete()
if count > 0:
self.message_user(request, f'成功删除 {count} 个已完成的任务', messages.SUCCESS)
delete_completed_tasks.short_description = '删除已完成的任务'
def get_urls(self):
"""添加自定义URL"""
urls = super().get_urls()
custom_urls = [
path(
'create-keyword-task/',
self.admin_site.admin_view(self.create_keyword_task_view),
name='create_keyword_task',
),
path(
'create-historical-task/',
self.admin_site.admin_view(self.create_historical_task_view),
name='create_historical_task',
),
path(
'create-full-site-task/',
self.admin_site.admin_view(self.create_full_site_task_view),
name='create_full_site_task',
),
path(
'<int:task_id>/start/',
self.admin_site.admin_view(self.start_task_view),
name='start_task',
),
path(
'<int:task_id>/cancel/',
self.admin_site.admin_view(self.cancel_task_view),
name='cancel_task',
),
path(
'<int:task_id>/results/',
self.admin_site.admin_view(self.view_results_view),
name='view_results',
),
]
return custom_urls + urls
def create_keyword_task_view(self, request):
"""创建关键词搜索任务视图"""
if request.method == 'POST':
try:
from .utils import WEBSITE_SEARCH_CONFIGS
name = request.POST.get('name', '')
keyword = request.POST.get('keyword', '')
websites = request.POST.getlist('websites')
start_date = request.POST.get('start_date')
end_date = request.POST.get('end_date')
max_pages = int(request.POST.get('max_pages', 10))
max_articles = int(request.POST.get('max_articles', 100))
if not name or not keyword:
self.message_user(request, '任务名称和关键词不能为空', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
# 创建任务
task = CrawlTask.objects.create(
name=name,
task_type='keyword',
keyword=keyword,
start_date=start_date if start_date else None,
end_date=end_date if end_date else None,
max_pages=max_pages,
max_articles=max_articles,
created_by=request.user.username if request.user.is_authenticated else 'admin'
)
# 添加选择的网站
if websites:
website_objects = Website.objects.filter(name__in=websites)
task.websites.set(website_objects)
self.message_user(request, f'关键词搜索任务 "{name}" 创建成功', messages.SUCCESS)
return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
except Exception as e:
self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
# GET请求显示创建表单
context = {
'websites': Website.objects.filter(enabled=True),
'title': '创建关键词搜索任务'
}
return self.render_create_task_template(request, 'admin/create_keyword_task.html', context)
def create_historical_task_view(self, request):
"""创建历史文章任务视图"""
if request.method == 'POST':
try:
from .utils import WEBSITE_SEARCH_CONFIGS
name = request.POST.get('name', '')
websites = request.POST.getlist('websites')
start_date = request.POST.get('start_date')
end_date = request.POST.get('end_date')
max_articles = int(request.POST.get('max_articles', 50))
if not name:
self.message_user(request, '任务名称不能为空', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
# 创建任务
task = CrawlTask.objects.create(
name=name,
task_type='historical',
keyword='历史文章',
start_date=start_date if start_date else None,
end_date=end_date if end_date else None,
max_articles=max_articles,
created_by=request.user.username if request.user.is_authenticated else 'admin'
)
# 添加选择的网站
if websites:
website_objects = Website.objects.filter(name__in=websites)
task.websites.set(website_objects)
self.message_user(request, f'历史文章任务 "{name}" 创建成功', messages.SUCCESS)
return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
except Exception as e:
self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
# GET请求显示创建表单
context = {
'websites': Website.objects.filter(enabled=True),
'title': '创建历史文章任务'
}
return self.render_create_task_template(request, 'admin/create_historical_task.html', context)
def create_full_site_task_view(self, request):
"""创建全站爬取任务视图"""
if request.method == 'POST':
try:
from .utils import WEBSITE_SEARCH_CONFIGS
name = request.POST.get('name', '')
websites = request.POST.getlist('websites')
max_pages = int(request.POST.get('max_pages', 500))
if not name:
self.message_user(request, '任务名称不能为空', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
# 创建任务
task = CrawlTask.objects.create(
name=name,
task_type='full_site',
keyword='全站爬取',
max_pages=max_pages,
created_by=request.user.username if request.user.is_authenticated else 'admin'
)
# 添加选择的网站
if websites:
website_objects = Website.objects.filter(name__in=websites)
task.websites.set(website_objects)
self.message_user(request, f'全站爬取任务 "{name}" 创建成功', messages.SUCCESS)
return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
except Exception as e:
self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
# GET请求显示创建表单
context = {
'websites': Website.objects.filter(enabled=True),
'title': '创建全站爬取任务'
}
return self.render_create_task_template(request, 'admin/create_full_site_task.html', context)
def start_task_view(self, request, task_id):
"""启动任务视图"""
try:
success, message = task_executor.start_task(task_id)
if success:
self.message_user(request, f'任务已启动: {message}', messages.SUCCESS)
else:
self.message_user(request, f'启动任务失败: {message}', messages.ERROR)
except Exception as e:
self.message_user(request, f'启动任务失败: {e}', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
def cancel_task_view(self, request, task_id):
"""取消任务视图"""
try:
success, message = task_executor.cancel_task(task_id)
if success:
self.message_user(request, f'任务已取消: {message}', messages.SUCCESS)
else:
self.message_user(request, f'取消任务失败: {message}', messages.ERROR)
except Exception as e:
self.message_user(request, f'取消任务失败: {e}', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
def view_results_view(self, request, task_id):
"""查看结果视图"""
try:
task = CrawlTask.objects.get(id=task_id)
context = {
'task': task,
'title': f'任务结果 - {task.name}'
}
return self.render_create_task_template(request, 'admin/task_results.html', context)
except CrawlTask.DoesNotExist:
self.message_user(request, '任务不存在', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
def render_create_task_template(self, request, template_name, context):
"""渲染创建任务模板"""
from django.template.loader import render_to_string
from django.http import HttpResponse
context.update({
'site_header': admin.site.site_header,
'site_title': admin.site.site_title,
'has_permission': True,
'user': request.user,
})
html = render_to_string(template_name, context)
return HttpResponse(html)
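For reference: since render_create_task_template only renders a template to an HttpResponse, Django's render() shortcut is an equivalent, more idiomatic one-liner. A minimal sketch, not part of this commit:

from django.shortcuts import render

def render_create_task_template(self, request, template_name, context):
    """Equivalent implementation using Django's render() shortcut."""
    context.update({
        'site_header': admin.site.site_header,
        'site_title': admin.site.site_title,
        'has_permission': True,
        'user': request.user,
    })
    return render(request, template_name, context)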
#class CrawlerStatusAdmin(admin.ModelAdmin):
# """爬虫状态管理"""
# change_list_template = 'admin/crawler_status.html'
@@ -448,6 +862,7 @@ class ArticleAdmin(admin.ModelAdmin):
# 注册管理类
admin.site.register(Website, WebsiteAdmin)
admin.site.register(Article, ArticleAdmin)
admin.site.register(CrawlTask, CrawlTaskAdmin)
# 隐藏Celery Results管理功能

View File

@@ -0,0 +1,257 @@
from django.core.management.base import BaseCommand
from core.utils import full_site_crawler, crawl_by_keyword, WEBSITE_SEARCH_CONFIGS
from core.models import Website
import json
class Command(BaseCommand):
help = "一键爬取所有支持的网站"
def add_arguments(self, parser):
parser.add_argument(
'--mode', '-m',
type=str,
choices=['full', 'keyword', 'both'],
default='full',
help='爬取模式: full(全站爬取), keyword(关键词爬取), both(两种模式)'
)
parser.add_argument(
'--keyword', '-k',
type=str,
help='关键词搜索模式下的搜索关键词'
)
parser.add_argument(
'--websites', '-w',
type=str,
nargs='*',
help='指定要爬取的网站名称列表,如果不指定则爬取所有支持的网站'
)
parser.add_argument(
'--max-pages', '-p',
type=int,
default=500,
help='全站爬取最大页数 (默认: 500)'
)
parser.add_argument(
'--max-search-pages', '-sp',
type=int,
default=10,
help='关键词搜索最大页数 (默认: 10)'
)
parser.add_argument(
'--max-articles', '-a',
type=int,
default=100,
help='关键词搜索最大文章数量 (默认: 100)'
)
parser.add_argument(
'--start-date', '-s',
type=str,
help='开始日期 (格式: YYYY-MM-DD)'
)
parser.add_argument(
'--end-date', '-e',
type=str,
help='结束日期 (格式: YYYY-MM-DD)'
)
parser.add_argument(
'--list-websites', '-l',
action='store_true',
help='列出所有支持的网站'
)
parser.add_argument(
'--output', '-o',
type=str,
help='将结果保存到JSON文件'
)
parser.add_argument(
'--skip-existing',
action='store_true',
help='跳过已存在的网站配置'
)
def handle(self, *args, **options):
# 列出支持的网站
if options['list_websites']:
self.stdout.write(self.style.SUCCESS("支持的网站列表:"))
for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
self.stdout.write(f"{i:2d}. {website}")
return
mode = options['mode']
keyword = options['keyword']
websites = options['websites']
max_pages = options['max_pages']
max_search_pages = options['max_search_pages']
max_articles = options['max_articles']
start_date = options['start_date']
end_date = options['end_date']
output_file = options['output']
skip_existing = options['skip_existing']
# 验证网站名称
if websites:
invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
if invalid_websites:
self.stdout.write(
self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}")
)
self.stdout.write("使用 --list-websites 查看支持的网站列表")
return
# 确定要爬取的网站列表
target_websites = websites if websites else list(WEBSITE_SEARCH_CONFIGS.keys())
# 验证关键词模式
if mode in ['keyword', 'both'] and not keyword:
self.stdout.write(
self.style.ERROR("关键词模式需要指定 --keyword 参数")
)
return
self.stdout.write(f"开始一键爬取任务...")
self.stdout.write(f"爬取模式: {mode}")
self.stdout.write(f"目标网站: {', '.join(target_websites)}")
if keyword:
self.stdout.write(f"关键词: {keyword}")
if start_date:
self.stdout.write(f"开始日期: {start_date}")
if end_date:
self.stdout.write(f"结束日期: {end_date}")
all_results = {
"mode": mode,
"websites": target_websites,
"keyword": keyword,
"start_date": start_date,
"end_date": end_date,
"full_crawl_results": {},
"keyword_crawl_results": {},
"summary": {
"total_websites": len(target_websites),
"full_crawl_success": 0,
"full_crawl_failed": 0,
"keyword_crawl_success": 0,
"keyword_crawl_failed": 0
}
}
try:
for website_name in target_websites:
self.stdout.write(f"\n{'='*50}")
self.stdout.write(f"开始处理网站: {website_name}")
self.stdout.write(f"{'='*50}")
# 获取或创建网站对象
website, created = Website.objects.get_or_create(
name=website_name,
defaults={
'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
'enabled': True
}
)
if not created and skip_existing:
self.stdout.write(f"跳过已存在的网站: {website_name}")
continue
website_results = {
"full_crawl": None,
"keyword_crawl": None
}
# 全站爬取
if mode in ['full', 'both']:
self.stdout.write(f"\n开始全站爬取: {website_name}")
try:
full_site_crawler(
WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
website,
max_pages=max_pages
)
self.stdout.write(self.style.SUCCESS(f"全站爬取完成: {website_name}"))
website_results["full_crawl"] = {"status": "success"}
all_results["summary"]["full_crawl_success"] += 1
except Exception as e:
self.stdout.write(self.style.ERROR(f"全站爬取失败: {website_name}, 错误: {e}"))
website_results["full_crawl"] = {"status": "failed", "error": str(e)}
all_results["summary"]["full_crawl_failed"] += 1
# 关键词爬取
if mode in ['keyword', 'both']:
self.stdout.write(f"\n开始关键词爬取: {website_name}")
try:
keyword_results = crawl_by_keyword(
keyword=keyword,
website_names=[website_name],
max_pages=max_search_pages,
start_date=start_date,
end_date=end_date,
max_articles=max_articles
)
website_results["keyword_crawl"] = keyword_results
if keyword_results["success_count"] > 0:
all_results["summary"]["keyword_crawl_success"] += 1
else:
all_results["summary"]["keyword_crawl_failed"] += 1
except Exception as e:
self.stdout.write(self.style.ERROR(f"关键词爬取失败: {website_name}, 错误: {e}"))
website_results["keyword_crawl"] = {"status": "failed", "error": str(e)}
all_results["summary"]["keyword_crawl_failed"] += 1
all_results["full_crawl_results"][website_name] = website_results["full_crawl"]
all_results["keyword_crawl_results"][website_name] = website_results["keyword_crawl"]
# 显示最终结果摘要
self.stdout.write(f"\n{'='*50}")
self.stdout.write(self.style.SUCCESS("一键爬取完成!"))
self.stdout.write(f"{'='*50}")
self.stdout.write(f"总网站数: {all_results['summary']['total_websites']}")
if mode in ['full', 'both']:
self.stdout.write(f"全站爬取 - 成功: {all_results['summary']['full_crawl_success']}, "
f"失败: {all_results['summary']['full_crawl_failed']}")
if mode in ['keyword', 'both']:
self.stdout.write(f"关键词爬取 - 成功: {all_results['summary']['keyword_crawl_success']}, "
f"失败: {all_results['summary']['keyword_crawl_failed']}")
# 显示各网站详细结果
self.stdout.write("\n各网站详细结果:")
for website_name in target_websites:
self.stdout.write(f"\n{website_name}:")
if mode in ['full', 'both']:
full_result = all_results["full_crawl_results"][website_name]
if full_result and full_result.get("status") == "success":
self.stdout.write(self.style.SUCCESS(" 全站爬取: 成功"))
elif full_result:
self.stdout.write(self.style.ERROR(f" 全站爬取: 失败 - {full_result.get('error', '未知错误')}"))
if mode in ['keyword', 'both']:
keyword_result = all_results["keyword_crawl_results"][website_name]
if keyword_result and "success_count" in keyword_result:
self.stdout.write(f" 关键词爬取: 成功 {keyword_result['success_count']} 篇, "
f"失败 {keyword_result['failed_count']}")
elif keyword_result and keyword_result.get("status") == "failed":
self.stdout.write(self.style.ERROR(f" 关键词爬取: 失败 - {keyword_result.get('error', '未知错误')}"))
# 保存结果到文件
if output_file:
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(all_results, f, ensure_ascii=False, indent=2)
self.stdout.write(f"\n结果已保存到: {output_file}")
except Exception as e:
self.stdout.write(self.style.ERROR(f"一键爬取过程中出现错误: {e}"))
raise
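A usage sketch for the command above. The command name crawl_all_sites is an assumption (the module's file name is elided in this diff), and the site names are placeholders for keys of WEBSITE_SEARCH_CONFIGS:

from django.core.management import call_command

# Full-site crawl of selected sites, capped at 200 pages each
# ('crawl_all_sites' and the site names are hypothetical)
call_command('crawl_all_sites', mode='full',
             websites=['SiteA', 'SiteB'], max_pages=200)

# Keyword + full-site crawl combined, saving the JSON summary to a file
call_command('crawl_all_sites', mode='both', keyword='人工智能',
             max_search_pages=5, max_articles=50, output='results.json')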

View File

@@ -0,0 +1,157 @@
from django.core.management.base import BaseCommand
from core.utils import crawl_by_keyword, crawl_historical_articles, WEBSITE_SEARCH_CONFIGS
import json
class Command(BaseCommand):
help = "根据关键词爬取多个网站的文章"
def add_arguments(self, parser):
parser.add_argument(
'--keyword', '-k',
type=str,
help='搜索关键词'
)
parser.add_argument(
'--websites', '-w',
type=str,
nargs='*',
help='指定要爬取的网站名称列表,如果不指定则爬取所有支持的网站'
)
parser.add_argument(
'--max-pages', '-p',
type=int,
default=10,
help='每个网站最大搜索页数 (默认: 10)'
)
parser.add_argument(
'--max-articles', '-a',
type=int,
default=100,
help='最大文章数量 (默认: 100)'
)
parser.add_argument(
'--start-date', '-s',
type=str,
help='开始日期 (格式: YYYY-MM-DD)'
)
parser.add_argument(
'--end-date', '-e',
type=str,
help='结束日期 (格式: YYYY-MM-DD)'
)
parser.add_argument(
'--historical',
action='store_true',
help='爬取历史文章模式'
)
parser.add_argument(
'--list-websites', '-l',
action='store_true',
help='列出所有支持的网站'
)
parser.add_argument(
'--output', '-o',
type=str,
help='将结果保存到JSON文件'
)
def handle(self, *args, **options):
# 列出支持的网站
if options['list_websites']:
self.stdout.write(self.style.SUCCESS("支持的网站列表:"))
for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
self.stdout.write(f"{i:2d}. {website}")
return
keyword = options['keyword']
if not keyword:
self.stdout.write(self.style.ERROR("必须指定 --keyword 参数"))
return
websites = options['websites']
max_pages = options['max_pages']
max_articles = options['max_articles']
start_date = options['start_date']
end_date = options['end_date']
historical = options['historical']
output_file = options['output']
# 验证网站名称
if websites:
invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
if invalid_websites:
self.stdout.write(
self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}")
)
self.stdout.write("使用 --list-websites 查看支持的网站列表")
return
self.stdout.write(f"开始爬取任务...")
self.stdout.write(f"关键词: {keyword}")
if websites:
self.stdout.write(f"目标网站: {', '.join(websites)}")
else:
self.stdout.write(f"目标网站: 所有支持的网站 ({len(WEBSITE_SEARCH_CONFIGS)}个)")
if start_date:
self.stdout.write(f"开始日期: {start_date}")
if end_date:
self.stdout.write(f"结束日期: {end_date}")
self.stdout.write(f"最大页数: {max_pages}")
self.stdout.write(f"最大文章数: {max_articles}")
try:
if historical:
# 历史文章爬取模式
self.stdout.write(self.style.WARNING("使用历史文章爬取模式"))
results = crawl_historical_articles(
website_names=websites,
start_date=start_date,
end_date=end_date,
max_articles_per_site=max_articles
)
else:
# 关键词搜索模式
results = crawl_by_keyword(
keyword=keyword,
website_names=websites,
max_pages=max_pages,
start_date=start_date,
end_date=end_date,
max_articles=max_articles
)
# 显示结果摘要
self.stdout.write(self.style.SUCCESS("\n爬取完成!"))
self.stdout.write(f"总文章数: {results['total_articles']}")
self.stdout.write(f"成功: {results['success_count']}")
self.stdout.write(f"失败: {results['failed_count']}")
# 显示各网站详细结果
self.stdout.write("\n各网站结果:")
for website, result in results['website_results'].items():
status = self.style.SUCCESS if result['success'] > 0 else self.style.WARNING
self.stdout.write(
status(f" {website}: 找到 {result['found_urls']} 篇, "
f"成功 {result['success']}, 失败 {result['failed']}")
)
if 'error' in result:
self.stdout.write(self.style.ERROR(f" 错误: {result['error']}"))
# 保存结果到文件
if output_file:
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
self.stdout.write(f"\n结果已保存到: {output_file}")
except Exception as e:
self.stdout.write(self.style.ERROR(f"爬取过程中出现错误: {e}"))
raise
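Likewise, a hedged usage sketch for this command (the name crawl_keyword is assumed; note that handle() requires --keyword even when --historical is set):

from django.core.management import call_command

# Keyword search across all configured sites, restricted to a date range
# ('crawl_keyword' is a hypothetical command name)
call_command('crawl_keyword', keyword='两会',
             start_date='2025-08-01', end_date='2025-09-01',
             max_pages=5, max_articles=50, output='keyword_results.json')

# Historical mode for two named sites (site names are placeholders)
call_command('crawl_keyword', keyword='历史文章', historical=True,
             websites=['SiteA', 'SiteB'], max_articles=20)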

View File

@@ -0,0 +1,45 @@
# Generated by Django 5.1 on 2025-09-23 19:28
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='CrawlTask',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=200, verbose_name='任务名称')),
('task_type', models.CharField(choices=[('keyword', '关键词搜索'), ('historical', '历史文章'), ('full_site', '全站爬取')], default='keyword', max_length=20, verbose_name='任务类型')),
('keyword', models.CharField(blank=True, max_length=200, null=True, verbose_name='搜索关键词')),
('websites', models.JSONField(default=list, verbose_name='目标网站')),
('start_date', models.DateField(blank=True, null=True, verbose_name='开始日期')),
('end_date', models.DateField(blank=True, null=True, verbose_name='结束日期')),
('max_pages', models.IntegerField(default=10, verbose_name='最大页数')),
('max_articles', models.IntegerField(default=100, verbose_name='最大文章数')),
('status', models.CharField(choices=[('pending', '等待中'), ('running', '运行中'), ('completed', '已完成'), ('failed', '失败'), ('cancelled', '已取消')], default='pending', max_length=20, verbose_name='状态')),
('progress', models.IntegerField(default=0, verbose_name='进度百分比')),
('current_website', models.CharField(blank=True, max_length=100, null=True, verbose_name='当前网站')),
('current_action', models.CharField(blank=True, max_length=200, null=True, verbose_name='当前操作')),
('total_articles', models.IntegerField(default=0, verbose_name='总文章数')),
('success_count', models.IntegerField(default=0, verbose_name='成功数')),
('failed_count', models.IntegerField(default=0, verbose_name='失败数')),
('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
('started_at', models.DateTimeField(blank=True, null=True, verbose_name='开始时间')),
('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='完成时间')),
('error_message', models.TextField(blank=True, null=True, verbose_name='错误信息')),
('result_details', models.JSONField(blank=True, default=dict, verbose_name='结果详情')),
('created_by', models.CharField(blank=True, max_length=100, null=True, verbose_name='创建者')),
],
options={
'verbose_name': '爬取任务',
'verbose_name_plural': '爬取任务',
'ordering': ['-created_at'],
},
),
]

View File

@@ -0,0 +1,22 @@
# Generated by Django 5.1 on 2025-09-23 19:34
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0002_crawltask'),
]
operations = [
migrations.RemoveField(
model_name='crawltask',
name='websites',
),
migrations.AddField(
model_name='crawltask',
name='websites',
field=models.ManyToManyField(blank=True, to='core.website', verbose_name='目标网站'),
),
]
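Note that RemoveField followed by AddField silently discards any website names already stored in the old JSON field. If preserving them mattered, a data migration could copy the names into the new relation before dropping the column; a sketch under the assumption that the stored strings match Website.name:

from django.db import migrations, models

def copy_websites(apps, schema_editor):
    CrawlTask = apps.get_model('core', 'CrawlTask')
    Website = apps.get_model('core', 'Website')
    for task in CrawlTask.objects.all():
        names = task.websites_old or []  # hypothetical renamed JSON field
        task.websites.set(Website.objects.filter(name__in=names))

class Migration(migrations.Migration):
    dependencies = [('core', '0002_crawltask')]
    operations = [
        migrations.RenameField('crawltask', 'websites', 'websites_old'),
        migrations.AddField('crawltask', 'websites',
                            models.ManyToManyField(blank=True, to='core.website',
                                                   verbose_name='目标网站')),
        migrations.RunPython(copy_websites, migrations.RunPython.noop),
        migrations.RemoveField('crawltask', 'websites_old'),
    ]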

View File

@@ -1,4 +1,6 @@
from django.db import models
from django.utils import timezone
import json
class Website(models.Model):
@@ -25,3 +27,93 @@ class Article(models.Model):
def __str__(self):
return self.title
class CrawlTask(models.Model):
"""爬取任务模型"""
TASK_STATUS_CHOICES = [
('pending', '等待中'),
('running', '运行中'),
('completed', '已完成'),
('failed', '失败'),
('cancelled', '已取消'),
]
TASK_TYPE_CHOICES = [
('keyword', '关键词搜索'),
('historical', '历史文章'),
('full_site', '全站爬取'),
]
name = models.CharField(max_length=200, verbose_name="任务名称")
task_type = models.CharField(max_length=20, choices=TASK_TYPE_CHOICES, default='keyword', verbose_name="任务类型")
keyword = models.CharField(max_length=200, blank=True, null=True, verbose_name="搜索关键词")
websites = models.ManyToManyField(Website, blank=True, verbose_name="目标网站")
start_date = models.DateField(blank=True, null=True, verbose_name="开始日期")
end_date = models.DateField(blank=True, null=True, verbose_name="结束日期")
max_pages = models.IntegerField(default=10, verbose_name="最大页数")
max_articles = models.IntegerField(default=100, verbose_name="最大文章数")
status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name="状态")
progress = models.IntegerField(default=0, verbose_name="进度百分比")
current_website = models.CharField(max_length=100, blank=True, null=True, verbose_name="当前网站")
current_action = models.CharField(max_length=200, blank=True, null=True, verbose_name="当前操作")
total_articles = models.IntegerField(default=0, verbose_name="总文章数")
success_count = models.IntegerField(default=0, verbose_name="成功数")
failed_count = models.IntegerField(default=0, verbose_name="失败数")
created_at = models.DateTimeField(auto_now_add=True, verbose_name="创建时间")
started_at = models.DateTimeField(blank=True, null=True, verbose_name="开始时间")
completed_at = models.DateTimeField(blank=True, null=True, verbose_name="完成时间")
error_message = models.TextField(blank=True, null=True, verbose_name="错误信息")
result_details = models.JSONField(default=dict, blank=True, verbose_name="结果详情")
created_by = models.CharField(max_length=100, blank=True, null=True, verbose_name="创建者")
class Meta:
verbose_name = "爬取任务"
verbose_name_plural = "爬取任务"
ordering = ['-created_at']
def __str__(self):
return f"{self.name} ({self.get_status_display()})"
def get_websites_display(self):
"""获取网站列表的显示文本"""
websites = self.websites.all()
if not websites:
return "所有网站"
return ", ".join([w.name for w in websites])
def get_duration(self):
"""获取任务执行时长"""
if not self.started_at:
return None
end_time = self.completed_at or timezone.now()
return end_time - self.started_at
def is_running(self):
"""判断任务是否正在运行"""
return self.status == 'running'
def can_cancel(self):
"""判断任务是否可以取消"""
return self.status in ['pending', 'running']
def get_progress_display(self):
"""获取进度显示文本"""
if self.status == 'pending':
return "等待开始"
elif self.status == 'running':
if self.current_website and self.current_action:
return f"正在处理 {self.current_website}: {self.current_action}"
return f"运行中 ({self.progress}%)"
elif self.status == 'completed':
return f"已完成 ({self.success_count}/{self.total_articles})"
elif self.status == 'failed':
return f"失败: {self.error_message[:50]}..." if self.error_message else "失败"
elif self.status == 'cancelled':
return "已取消"
return "未知状态"

View File

@@ -0,0 +1,84 @@
/**
* 爬取任务操作JavaScript
*/
function startTask(taskId) {
if (confirm('确定要启动这个任务吗?')) {
fetch(`/admin/core/crawltask/${taskId}/start/`, {
method: 'POST',
headers: {
'X-CSRFToken': getCookie('csrftoken'),
'Content-Type': 'application/x-www-form-urlencoded',
},
})
.then(response => {
if (response.ok) {
location.reload();
} else {
alert('启动任务失败');
}
})
.catch(error => {
console.error('Error:', error);
alert('启动任务失败');
});
}
}
function cancelTask(taskId) {
if (confirm('确定要取消这个任务吗?')) {
fetch(`/admin/core/crawltask/${taskId}/cancel/`, {
method: 'POST',
headers: {
'X-CSRFToken': getCookie('csrftoken'),
'Content-Type': 'application/x-www-form-urlencoded',
},
})
.then(response => {
if (response.ok) {
location.reload();
} else {
alert('取消任务失败');
}
})
.catch(error => {
console.error('Error:', error);
alert('取消任务失败');
});
}
}
function viewResults(taskId) {
window.open(`/admin/core/crawltask/${taskId}/results/`, '_blank');
}
function getCookie(name) {
let cookieValue = null;
if (document.cookie && document.cookie !== '') {
const cookies = document.cookie.split(';');
for (let i = 0; i < cookies.length; i++) {
const cookie = cookies[i].trim();
if (cookie.substring(0, name.length + 1) === (name + '=')) {
cookieValue = decodeURIComponent(cookie.substring(name.length + 1));
break;
}
}
}
return cookieValue;
}
// 自动刷新运行中的任务状态
function autoRefreshRunningTasks() {
const runningTasks = document.querySelectorAll('[data-task-status="running"]');
if (runningTasks.length > 0) {
// 每30秒刷新一次页面
setTimeout(() => {
location.reload();
}, 30000);
}
}
// 页面加载完成后执行
document.addEventListener('DOMContentLoaded', function() {
autoRefreshRunningTasks();
});

core/task_executor.py (new file, 235 lines)
View File

@@ -0,0 +1,235 @@
"""
爬取任务执行器
负责执行爬取任务并更新任务状态
"""
import threading
import time
from django.utils import timezone
from django.db import transaction
from core.models import CrawlTask
from core.utils import crawl_by_keyword, crawl_historical_articles, full_site_crawler, WEBSITE_SEARCH_CONFIGS
class TaskExecutor:
"""任务执行器"""
def __init__(self):
self.running_tasks = {}
self.lock = threading.Lock()
def start_task(self, task_id):
"""启动任务"""
with self.lock:
if task_id in self.running_tasks:
return False, "任务已在运行中"
try:
task = CrawlTask.objects.get(id=task_id)
if task.status != 'pending':
return False, "任务状态不允许启动"
# 更新任务状态
task.status = 'running'
task.started_at = timezone.now()
task.save()
# 启动后台线程执行任务
thread = threading.Thread(target=self._execute_task, args=(task_id,))
thread.daemon = True
thread.start()
self.running_tasks[task_id] = thread
return True, "任务已启动"
except CrawlTask.DoesNotExist:
return False, "任务不存在"
except Exception as e:
return False, f"启动任务失败: {e}"
def cancel_task(self, task_id):
"""取消任务"""
with self.lock:
if task_id in self.running_tasks:
# 标记任务为取消状态
try:
task = CrawlTask.objects.get(id=task_id)
task.status = 'cancelled'
task.completed_at = timezone.now()
task.save()
# 移除运行中的任务
del self.running_tasks[task_id]
return True, "任务已取消"
except CrawlTask.DoesNotExist:
return False, "任务不存在"
else:
return False, "任务未在运行中"
def _execute_task(self, task_id):
"""执行任务的核心逻辑"""
try:
task = CrawlTask.objects.get(id=task_id)
# 根据任务类型执行不同的爬取逻辑
if task.task_type == 'keyword':
self._execute_keyword_task(task)
elif task.task_type == 'historical':
self._execute_historical_task(task)
elif task.task_type == 'full_site':
self._execute_full_site_task(task)
else:
raise ValueError(f"不支持的任务类型: {task.task_type}")
# 任务完成
with transaction.atomic():
task = CrawlTask.objects.select_for_update().get(id=task_id)
task.status = 'completed'
task.completed_at = timezone.now()
task.progress = 100
task.save()
except Exception as e:
# 任务失败
try:
with transaction.atomic():
task = CrawlTask.objects.select_for_update().get(id=task_id)
task.status = 'failed'
task.completed_at = timezone.now()
task.error_message = str(e)
task.save()
except Exception:
# 更新失败状态本身出错时忽略,避免掩盖原始异常
pass
finally:
# 清理运行中的任务记录
with self.lock:
if task_id in self.running_tasks:
del self.running_tasks[task_id]
def _execute_keyword_task(self, task):
"""执行关键词搜索任务"""
# 更新当前操作
task.current_action = "开始关键词搜索"
task.save()
# 准备参数
websites = [w.name for w in task.websites.all()] or list(WEBSITE_SEARCH_CONFIGS.keys())
start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None
end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None
# 执行爬取
results = crawl_by_keyword(
keyword=task.keyword,
website_names=websites,
max_pages=task.max_pages,
start_date=start_date,
end_date=end_date,
max_articles=task.max_articles
)
# 更新结果
task.total_articles = results['total_articles']
task.success_count = results['success_count']
task.failed_count = results['failed_count']
task.result_details = results['website_results']
task.save()
def _execute_historical_task(self, task):
"""执行历史文章任务"""
# 更新当前操作
task.current_action = "开始历史文章爬取"
task.save()
# 准备参数
websites = [w.name for w in task.websites.all()] or list(WEBSITE_SEARCH_CONFIGS.keys())
start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None
end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None
# 执行爬取
results = crawl_historical_articles(
website_names=websites,
start_date=start_date,
end_date=end_date,
max_articles_per_site=task.max_articles
)
# 更新结果
task.total_articles = results['total_articles']
task.success_count = results['success_count']
task.failed_count = results['failed_count']
task.result_details = results['website_results']
task.save()
def _execute_full_site_task(self, task):
"""执行全站爬取任务"""
# 更新当前操作
task.current_action = "开始全站爬取"
task.save()
# 准备参数
websites = [w.name for w in task.websites.all()] or list(WEBSITE_SEARCH_CONFIGS.keys())
total_websites = len(websites)
completed_websites = 0
for website_name in websites:
try:
# 更新当前网站
task.current_website = website_name
task.current_action = f"正在爬取 {website_name}"
task.save()
# 获取或创建网站对象
from core.models import Website
website, created = Website.objects.get_or_create(
name=website_name,
defaults={
'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
'enabled': True
}
)
# 执行全站爬取
full_site_crawler(
WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
website,
max_pages=task.max_pages
)
completed_websites += 1
progress = int((completed_websites / total_websites) * 100)
task.progress = progress
task.save()
except Exception as e:
# 记录错误但继续处理其他网站
print(f"爬取网站 {website_name} 时出错: {e}")
continue
# 更新最终结果
task.total_articles = completed_websites # 这里可以改为实际爬取的文章数
task.success_count = completed_websites
task.failed_count = total_websites - completed_websites
task.save()
def get_task_status(self, task_id):
"""获取任务状态"""
try:
task = CrawlTask.objects.get(id=task_id)
return {
'status': task.status,
'progress': task.progress,
'current_website': task.current_website,
'current_action': task.current_action,
'total_articles': task.total_articles,
'success_count': task.success_count,
'failed_count': task.failed_count,
'error_message': task.error_message
}
except CrawlTask.DoesNotExist:
return None
# 全局任务执行器实例
task_executor = TaskExecutor()
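A minimal usage sketch for the executor (run inside a Django process, e.g. manage.py shell; the task ID is assumed to exist and be pending):

import time
from core.task_executor import task_executor

ok, msg = task_executor.start_task(task_id=1)
print(ok, msg)  # (True, "任务已启动") for a pending task

# Poll until the background thread finishes
while True:
    status = task_executor.get_task_status(1)
    print(status['status'], status['progress'])
    if status['status'] in ('completed', 'failed', 'cancelled'):
        break
    time.sleep(5)

One caveat worth noting: cancel_task only flips the database row to cancelled; the daemon thread itself is not interrupted, so a cancelled task may keep writing progress updates until its crawl loop finishes.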

View File

@@ -0,0 +1,139 @@
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_modify %}
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
&rsaquo; <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
&rsaquo; {{ title }}
</div>
{% endblock %}
{% block content %}
<h1>{{ title }}</h1>
<div class="help" style="background: #fff3cd; border: 1px solid #ffeaa7; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
<strong>注意:</strong>全站爬取会爬取整个网站的所有文章,可能需要很长时间。建议在非高峰时段进行。
</div>
<form method="post" id="full-site-task-form">
{% csrf_token %}
<fieldset class="module aligned">
<h2>基本信息</h2>
<div class="form-row">
<div>
<label for="id_name" class="required">任务名称:</label>
<input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
<p class="help">为这个全站爬取任务起一个容易识别的名称</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>目标网站</h2>
<div class="form-row">
<div>
<label>选择要爬取的网站:</label>
<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
<label style="display: block; margin: 5px 0;">
<input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
<strong>全选/取消全选</strong>
</label>
<hr style="margin: 10px 0;">
{% for website in websites %}
<label style="display: block; margin: 3px 0;">
<input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
{{ website.name }}
</label>
{% endfor %}
</div>
<p class="help">不选择任何网站将爬取所有支持的网站</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>爬取设置</h2>
<div class="form-row">
<div>
<label for="id_max_pages">最大爬取页数:</label>
<input type="number" name="max_pages" id="id_max_pages" value="500" min="1" max="5000" style="width: 100px;">
<p class="help">每个网站最多爬取的页数 (1-5000)</p>
</div>
</div>
</fieldset>
<div class="submit-row">
<input type="submit" value="创建任务" class="default" name="_save">
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
</div>
</form>
<script>
function toggleAllWebsites() {
const selectAll = document.getElementById('select_all');
const checkboxes = document.querySelectorAll('.website-checkbox');
checkboxes.forEach(checkbox => {
checkbox.checked = selectAll.checked;
});
}
</script>
<style>
.form-row {
margin-bottom: 15px;
}
.form-row label {
display: block;
font-weight: bold;
margin-bottom: 5px;
}
.form-row input[type="text"],
.form-row input[type="number"] {
padding: 5px;
border: 1px solid #ddd;
border-radius: 3px;
}
.form-row .help {
color: #666;
font-size: 12px;
margin-top: 3px;
}
.submit-row {
margin-top: 20px;
padding-top: 20px;
border-top: 1px solid #ddd;
}
.submit-row input[type="submit"] {
background: #417690;
color: white;
padding: 10px 20px;
border: none;
border-radius: 3px;
cursor: pointer;
}
.submit-row .cancel-link {
margin-left: 10px;
padding: 10px 20px;
background: #f8f8f8;
color: #333;
text-decoration: none;
border-radius: 3px;
border: 1px solid #ddd;
}
.submit-row .cancel-link:hover {
background: #e8e8e8;
}
</style>
{% endblock %}

View File

@@ -0,0 +1,164 @@
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_modify %}
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
&rsaquo; <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
&rsaquo; {{ title }}
</div>
{% endblock %}
{% block content %}
<h1>{{ title }}</h1>
<form method="post" id="historical-task-form">
{% csrf_token %}
<fieldset class="module aligned">
<h2>基本信息</h2>
<div class="form-row">
<div>
<label for="id_name" class="required">任务名称:</label>
<input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
<p class="help">为这个历史文章爬取任务起一个容易识别的名称</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>目标网站</h2>
<div class="form-row">
<div>
<label>选择要爬取的网站:</label>
<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
<label style="display: block; margin: 5px 0;">
<input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
<strong>全选/取消全选</strong>
</label>
<hr style="margin: 10px 0;">
{% for website in websites %}
<label style="display: block; margin: 3px 0;">
<input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
{{ website.name }}
</label>
{% endfor %}
</div>
<p class="help">不选择任何网站将爬取所有支持的网站</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>时间范围</h2>
<div class="form-row">
<div>
<label for="id_start_date" class="required">开始日期:</label>
<input type="date" name="start_date" id="id_start_date" required>
<p class="help">历史文章的开始日期</p>
</div>
</div>
<div class="form-row">
<div>
<label for="id_end_date" class="required">结束日期:</label>
<input type="date" name="end_date" id="id_end_date" required>
<p class="help">历史文章的结束日期</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>爬取设置</h2>
<div class="form-row">
<div>
<label for="id_max_articles">每个网站最大文章数:</label>
<input type="number" name="max_articles" id="id_max_articles" value="50" min="1" max="500" style="width: 100px;">
<p class="help">每个网站最多爬取的文章数量 (1-500)</p>
</div>
</div>
</fieldset>
<div class="submit-row">
<input type="submit" value="创建任务" class="default" name="_save">
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
</div>
</form>
<script>
function toggleAllWebsites() {
const selectAll = document.getElementById('select_all');
const checkboxes = document.querySelectorAll('.website-checkbox');
checkboxes.forEach(checkbox => {
checkbox.checked = selectAll.checked;
});
}
// 设置默认日期
document.addEventListener('DOMContentLoaded', function() {
const today = new Date();
const oneMonthAgo = new Date(today.getFullYear(), today.getMonth() - 1, today.getDate());
document.getElementById('id_end_date').value = today.toISOString().split('T')[0];
document.getElementById('id_start_date').value = oneMonthAgo.toISOString().split('T')[0];
});
</script>
<style>
.form-row {
margin-bottom: 15px;
}
.form-row label {
display: block;
font-weight: bold;
margin-bottom: 5px;
}
.form-row input[type="text"],
.form-row input[type="number"],
.form-row input[type="date"] {
padding: 5px;
border: 1px solid #ddd;
border-radius: 3px;
}
.form-row .help {
color: #666;
font-size: 12px;
margin-top: 3px;
}
.submit-row {
margin-top: 20px;
padding-top: 20px;
border-top: 1px solid #ddd;
}
.submit-row input[type="submit"] {
background: #417690;
color: white;
padding: 10px 20px;
border: none;
border-radius: 3px;
cursor: pointer;
}
.submit-row .cancel-link {
margin-left: 10px;
padding: 10px 20px;
background: #f8f8f8;
color: #333;
text-decoration: none;
border-radius: 3px;
border: 1px solid #ddd;
}
.submit-row .cancel-link:hover {
background: #e8e8e8;
}
</style>
{% endblock %}

View File

@@ -0,0 +1,180 @@
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_modify %}
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
&rsaquo; <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
&rsaquo; {{ title }}
</div>
{% endblock %}
{% block content %}
<h1>{{ title }}</h1>
<form method="post" id="keyword-task-form">
{% csrf_token %}
<fieldset class="module aligned">
<h2>基本信息</h2>
<div class="form-row">
<div>
<label for="id_name" class="required">任务名称:</label>
<input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
<p class="help">为这个爬取任务起一个容易识别的名称</p>
</div>
</div>
<div class="form-row">
<div>
<label for="id_keyword" class="required">搜索关键词:</label>
<input type="text" name="keyword" id="id_keyword" required maxlength="200" style="width: 300px;">
<p class="help">输入要搜索的关键词,例如:人工智能、两会、政策等</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>目标网站</h2>
<div class="form-row">
<div>
<label>选择要爬取的网站:</label>
<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
<label style="display: block; margin: 5px 0;">
<input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
<strong>全选/取消全选</strong>
</label>
<hr style="margin: 10px 0;">
{% for website in websites %}
<label style="display: block; margin: 3px 0;">
<input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
{{ website.name }}
</label>
{% endfor %}
</div>
<p class="help">不选择任何网站将爬取所有支持的网站</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>时间范围 (可选)</h2>
<div class="form-row">
<div>
<label for="id_start_date">开始日期:</label>
<input type="date" name="start_date" id="id_start_date">
<p class="help">留空则搜索所有时间</p>
</div>
</div>
<div class="form-row">
<div>
<label for="id_end_date">结束日期:</label>
<input type="date" name="end_date" id="id_end_date">
<p class="help">留空则搜索到当前时间</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>爬取设置</h2>
<div class="form-row">
<div>
<label for="id_max_pages">最大搜索页数:</label>
<input type="number" name="max_pages" id="id_max_pages" value="10" min="1" max="100" style="width: 100px;">
<p class="help">每个网站最多搜索的页数 (1-100)</p>
</div>
</div>
<div class="form-row">
<div>
<label for="id_max_articles">最大文章数量:</label>
<input type="number" name="max_articles" id="id_max_articles" value="100" min="1" max="1000" style="width: 100px;">
<p class="help">总共最多爬取的文章数量 (1-1000)</p>
</div>
</div>
</fieldset>
<div class="submit-row">
<input type="submit" value="创建任务" class="default" name="_save">
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
</div>
</form>
<script>
function toggleAllWebsites() {
const selectAll = document.getElementById('select_all');
const checkboxes = document.querySelectorAll('.website-checkbox');
checkboxes.forEach(checkbox => {
checkbox.checked = selectAll.checked;
});
}
// 设置默认日期
document.addEventListener('DOMContentLoaded', function() {
const today = new Date();
const oneMonthAgo = new Date(today.getFullYear(), today.getMonth() - 1, today.getDate());
document.getElementById('id_end_date').value = today.toISOString().split('T')[0];
document.getElementById('id_start_date').value = oneMonthAgo.toISOString().split('T')[0];
});
</script>
<style>
.form-row {
margin-bottom: 15px;
}
.form-row label {
display: block;
font-weight: bold;
margin-bottom: 5px;
}
.form-row input[type="text"],
.form-row input[type="number"],
.form-row input[type="date"] {
padding: 5px;
border: 1px solid #ddd;
border-radius: 3px;
}
.form-row .help {
color: #666;
font-size: 12px;
margin-top: 3px;
}
.submit-row {
margin-top: 20px;
padding-top: 20px;
border-top: 1px solid #ddd;
}
.submit-row input[type="submit"] {
background: #417690;
color: white;
padding: 10px 20px;
border: none;
border-radius: 3px;
cursor: pointer;
}
.submit-row .cancel-link {
margin-left: 10px;
padding: 10px 20px;
background: #f8f8f8;
color: #333;
text-decoration: none;
border-radius: 3px;
border: 1px solid #ddd;
}
.submit-row .cancel-link:hover {
background: #e8e8e8;
}
</style>
{% endblock %}

View File

@@ -0,0 +1,172 @@
{% extends "admin/base_site.html" %}
{% load i18n static %}
{% block extrastyle %}{{ block.super }}<link rel="stylesheet" type="text/css" href="{% static "admin/css/dashboard.css" %}">{% endblock %}
{% block coltype %}colMS{% endblock %}
{% block bodyclass %}{{ block.super }} dashboard{% endblock %}
{% block breadcrumbs %}{% endblock %}
{% block nav-sidebar %}{% endblock %}
{% block content %}
<div id="content-main">
{% if app_list %}
{% for app in app_list %}
<div class="app-{{ app.app_label }} module">
<table>
<caption>
<a href="{{ app.app_url }}" class="section" title="{% blocktranslate with name=app.name %}Models in the {{ name }} application{% endblocktranslate %}">{{ app.name }}</a>
</caption>
{% for model in app.models %}
<tr class="model-{{ model.object_name|lower }}">
{% if model.admin_url %}
<th scope="row"><a href="{{ model.admin_url }}"{% if model.add_url %} class="addlink"{% endif %}>{{ model.name }}</a></th>
{% else %}
<th scope="row">{{ model.name }}</th>
{% endif %}
{% if model.add_url %}
<td><a href="{{ model.add_url }}" class="addlink">{% translate 'Add' %}</a></td>
{% else %}
<td>&nbsp;</td>
{% endif %}
{% if model.admin_url %}
{% if model.view_only %}
<td><a href="{{ model.admin_url }}" class="viewlink">{% translate 'View' %}</a></td>
{% else %}
<td><a href="{{ model.admin_url }}" class="changelink">{% translate 'Change' %}</a></td>
{% endif %}
{% else %}
<td>&nbsp;</td>
{% endif %}
</tr>
{% endfor %}
</table>
</div>
{% endfor %}
{% else %}
<p>{% translate "You don't have permission to view or edit anything." %}</p>
{% endif %}
<!-- 自定义快速操作区域 -->
<div class="module" style="margin-top: 20px;">
<h2>快速创建爬取任务</h2>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin-top: 15px;">
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
<h3 style="margin-top: 0; color: #417690;">关键词搜索</h3>
<p style="color: #666; font-size: 14px;">根据关键词搜索并爬取相关文章</p>
<a href="{% url 'admin:create_keyword_task' %}" class="button" style="background: #417690; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
创建任务
</a>
</div>
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
<h3 style="margin-top: 0; color: #28a745;">历史文章</h3>
<p style="color: #666; font-size: 14px;">爬取指定日期范围的历史文章</p>
<a href="{% url 'admin:create_historical_task' %}" class="button" style="background: #28a745; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
创建任务
</a>
</div>
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
<h3 style="margin-top: 0; color: #dc3545;">全站爬取</h3>
<p style="color: #666; font-size: 14px;">爬取整个网站的所有文章</p>
<a href="{% url 'admin:create_full_site_task' %}" class="button" style="background: #dc3545; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
创建任务
</a>
</div>
</div>
</div>
<!-- 最近任务状态 -->
<div class="module" style="margin-top: 20px;">
<h2>最近任务状态</h2>
<div style="margin-top: 15px;">
{% load core_extras %}
{% get_recent_tasks as recent_tasks %}
{% if recent_tasks %}
<table style="width: 100%;">
<thead>
<tr style="background: #f8f9fa;">
<th style="padding: 8px; text-align: left;">任务名称</th>
<th style="padding: 8px; text-align: left;">类型</th>
<th style="padding: 8px; text-align: left;">状态</th>
<th style="padding: 8px; text-align: left;">进度</th>
<th style="padding: 8px; text-align: left;">创建时间</th>
<th style="padding: 8px; text-align: left;">操作</th>
</tr>
</thead>
<tbody>
{% for task in recent_tasks %}
<tr>
<td style="padding: 8px;">{{ task.name }}</td>
<td style="padding: 8px;">{{ task.get_task_type_display }}</td>
<td style="padding: 8px;">
<span style="color: {% if task.status == 'completed' %}green{% elif task.status == 'failed' %}red{% elif task.status == 'running' %}blue{% else %}gray{% endif %};">
{{ task.get_status_display }}
</span>
</td>
<td style="padding: 8px;">
{% if task.status == 'running' %}
<div style="width: 100px; background-color: #f0f0f0; border-radius: 3px; overflow: hidden;">
<div style="width: {{ task.progress }}%; background-color: #4CAF50; height: 16px; text-align: center; line-height: 16px; color: white; font-size: 12px;">
{{ task.progress }}%
</div>
</div>
{% else %}
-
{% endif %}
</td>
<td style="padding: 8px;">{{ task.created_at|date:"m-d H:i" }}</td>
<td style="padding: 8px;">
<a href="{% url 'admin:core_crawltask_change' task.id %}" style="color: #417690; text-decoration: none;">查看</a>
</td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p style="color: #666; text-align: center; padding: 20px;">暂无任务</p>
{% endif %}
</div>
</div>
</div>
{% endblock %}
{% block sidebar %}
<div id="content-related">
<div class="module" id="recent-actions-module">
<h2>{% translate 'Recent actions' %}</h2>
<h3>{% translate 'My actions' %}</h3>
{% load log %}
{% get_admin_log 10 as admin_log for_user user %}
{% if not admin_log %}
<p>{% translate 'None available' %}</p>
{% else %}
<ul class="actionlist">
{% for entry in admin_log %}
<li class="{% if entry.is_addition %}addlink{% endif %}{% if entry.is_change %}changelink{% endif %}{% if entry.is_deletion %}deletelink{% endif %}">
{% if entry.is_deletion or not entry.get_admin_url %}
{{ entry.object_repr }}
{% else %}
<a href="{{ entry.get_admin_url }}">{{ entry.object_repr }}</a>
{% endif %}
<br>
{% if entry.content_type %}
<span class="mini quiet">{% filter capfirst %}{{ entry.content_type.name }}{% endfilter %}</span>
{% else %}
<span class="mini quiet">{% translate 'Unknown content' %}</span>
{% endif %}
</li>
{% endfor %}
</ul>
{% endif %}
</div>
</div>
{% endblock %}

View File

@@ -0,0 +1,184 @@
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_modify %}
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
&rsaquo; <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
&rsaquo; {{ title }}
</div>
{% endblock %}
{% block content %}
<h1>{{ title }}</h1>
<div class="results-summary" style="background: #f8f9fa; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2>任务概览</h2>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
<div>
<strong>任务名称:</strong><br>
{{ task.name }}
</div>
<div>
<strong>任务类型:</strong><br>
{{ task.get_task_type_display }}
</div>
<div>
<strong>状态:</strong><br>
<span style="color: {% if task.status == 'completed' %}green{% elif task.status == 'failed' %}red{% elif task.status == 'running' %}blue{% else %}gray{% endif %};">
{{ task.get_status_display }}
</span>
</div>
<div>
<strong>创建时间:</strong><br>
{{ task.created_at|date:"Y-m-d H:i:s" }}
</div>
{% if task.started_at %}
<div>
<strong>开始时间:</strong><br>
{{ task.started_at|date:"Y-m-d H:i:s" }}
</div>
{% endif %}
{% if task.completed_at %}
<div>
<strong>完成时间:</strong><br>
{{ task.completed_at|date:"Y-m-d H:i:s" }}
</div>
{% endif %}
{% if task.get_duration %}
<div>
<strong>执行时长:</strong><br>
{{ task.duration_display }}
</div>
{% endif %}
</div>
</div>
<div class="results-stats" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2>统计信息</h2>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 15px;">
<div style="text-align: center; padding: 15px; background: #e3f2fd; border-radius: 5px;">
<div style="font-size: 24px; font-weight: bold; color: #1976d2;">{{ task.total_articles }}</div>
<div>总文章数</div>
</div>
<div style="text-align: center; padding: 15px; background: #e8f5e8; border-radius: 5px;">
<div style="font-size: 24px; font-weight: bold; color: #388e3c;">{{ task.success_count }}</div>
<div>成功数</div>
</div>
<div style="text-align: center; padding: 15px; background: #ffebee; border-radius: 5px;">
<div style="font-size: 24px; font-weight: bold; color: #d32f2f;">{{ task.failed_count }}</div>
<div>失败数</div>
</div>
{% if task.total_articles > 0 %}
<div style="text-align: center; padding: 15px; background: #fff3e0; border-radius: 5px;">
<div style="font-size: 24px; font-weight: bold; color: #f57c00;">
{% widthratio task.success_count task.total_articles 100 %}%
</div>
<div>成功率</div>
</div>
{% endif %}
</div>
</div>
{% if task.keyword %}
<div class="task-config" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2>任务配置</h2>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
<div>
<strong>搜索关键词:</strong><br>
{{ task.keyword }}
</div>
<div>
<strong>目标网站:</strong><br>
{{ task.get_websites_display }}
</div>
{% if task.start_date %}
<div>
<strong>开始日期:</strong><br>
{{ task.start_date }}
</div>
{% endif %}
{% if task.end_date %}
<div>
<strong>结束日期:</strong><br>
{{ task.end_date }}
</div>
{% endif %}
<div>
<strong>最大页数:</strong><br>
{{ task.max_pages }}
</div>
<div>
<strong>最大文章数:</strong><br>
{{ task.max_articles }}
</div>
</div>
</div>
{% endif %}
{% if task.current_website or task.current_action %}
<div class="current-status" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2>当前状态</h2>
{% if task.current_website %}
<div>
<strong>当前网站:</strong> {{ task.current_website }}
</div>
{% endif %}
{% if task.current_action %}
<div>
<strong>当前操作:</strong> {{ task.current_action }}
</div>
{% endif %}
{% if task.status == 'running' %}
<div style="margin-top: 10px;">
<div style="width: 100%; background-color: #f0f0f0; border-radius: 10px; overflow: hidden;">
<div style="width: {{ task.progress }}%; background-color: #4CAF50; height: 20px; text-align: center; line-height: 20px; color: white;">
{{ task.progress }}%
</div>
</div>
</div>
{% endif %}
</div>
{% endif %}
{% if task.error_message %}
<div class="error-info" style="background: #ffebee; border: 1px solid #f44336; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2 style="color: #d32f2f;">错误信息</h2>
<pre style="white-space: pre-wrap; word-wrap: break-word;">{{ task.error_message }}</pre>
</div>
{% endif %}
{% if task.result_details %}
<div class="detailed-results" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2>详细结果</h2>
{% for website, result in task.result_details.items %}
<div style="margin-bottom: 15px; padding: 10px; background: #f8f9fa; border-radius: 3px;">
<strong>{{ website }}:</strong>
<ul style="margin: 5px 0; padding-left: 20px;">
<li>找到链接: {{ result.found_urls }}</li>
<li>已处理: {{ result.processed }}</li>
<li>成功: {{ result.success }}</li>
<li>失败: {{ result.failed }}</li>
{% if result.error %}
<li style="color: red;">错误: {{ result.error }}</li>
{% endif %}
</ul>
</div>
{% endfor %}
</div>
{% endif %}
<div class="actions" style="text-align: center; margin-top: 30px;">
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button" style="padding: 10px 20px; background: #417690; color: white; text-decoration: none; border-radius: 3px; margin-right: 10px;">
返回任务列表
</a>
{% if task.status == 'completed' %}
<a href="{% url 'admin:core_article_changelist' %}" class="button" style="padding: 10px 20px; background: #28a745; color: white; text-decoration: none; border-radius: 3px;">
查看文章
</a>
{% endif %}
</div>
{% endblock %}
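The template above expects a `task` object in its context. For orientation, a minimal sketch of an admin view that could render it — the view name and template path are assumptions, not shown in this commit:

# 假设性示意:渲染上面结果页模板的admin视图(视图名与模板路径均为假设)
from django.contrib import admin
from django.shortcuts import get_object_or_404, render

from core.models import CrawlTask

def crawl_task_results_view(request, task_id):
    task = get_object_or_404(CrawlTask, pk=task_id)
    context = {
        **admin.site.each_context(request),  # 注入admin全局上下文(站点标题、可用应用等)
        'title': f'任务结果: {task.name}',
        'task': task,
    }
    return render(request, 'admin/crawl_task_results.html', context)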

View File

View File

@@ -0,0 +1,46 @@
from django import template
from django.core.cache import cache
from django.utils.html import format_html

from core.models import CrawlTask

register = template.Library()


@register.simple_tag
def get_recent_tasks(limit=5):
    """获取最近的任务"""
    cache_key = f'recent_tasks_{limit}'
    recent_tasks = cache.get(cache_key)
    if recent_tasks is None:
        # 先用list()求值再缓存;直接缓存未求值的QuerySet会在每次取出时重新查库,起不到缓存作用
        recent_tasks = list(CrawlTask.objects.all()[:limit])
        cache.set(cache_key, recent_tasks, 60)  # 缓存1分钟
    return recent_tasks


@register.filter
def task_status_color(status):
    """根据任务状态返回颜色"""
    color_map = {
        'pending': 'gray',
        'running': 'blue',
        'completed': 'green',
        'failed': 'red',
        'cancelled': 'orange',
    }
    return color_map.get(status, 'gray')


@register.filter
def task_progress_bar(progress):
    """生成进度条HTML"""
    if progress is None:
        progress = 0
    # 用format_html构造并标记为安全HTML;直接返回普通字符串会被模板自动转义,标签会显示成文本
    return format_html(
        '<div style="width: 100px; background-color: #f0f0f0; border-radius: 3px; overflow: hidden;">'
        '<div style="width: {}%; background-color: #4CAF50; height: 16px; '
        'text-align: center; line-height: 16px; color: white; font-size: 12px;">{}%</div>'
        '</div>',
        progress, progress
    )
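For reference, a minimal usage sketch in a template — assuming this file lives at, say, core/templatetags/crawl_task_tags.py (the module path is not shown in this diff):

{% load crawl_task_tags %}
{% get_recent_tasks 5 as recent_tasks %}
{% for t in recent_tasks %}
    <span style="color: {{ t.status|task_status_color }};">{{ t.name }}</span>
    {{ t.progress|task_progress_bar }}
{% endfor %}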

View File

@@ -1,7 +1,7 @@
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, quote
from collections import deque
from django.utils import timezone
from django.conf import settings
@@ -15,6 +15,8 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
import json
def get_selenium_driver():
@@ -2270,3 +2272,695 @@ def full_site_crawler(start_url, website, max_pages=1000):
                queue.append(href)
            elif href not in visited and is_valid_url(href, base_netloc):
                queue.append(href)
# 网站搜索配置
WEBSITE_SEARCH_CONFIGS = {
    "新华网": {
        "search_url": "http://so.news.cn/getNews",
        "search_params": {
            "keyword": "{keyword}",
            "curPage": "{page}",
            "sortField": "0",
            "sortType": "1"
        },
        "method": "post",
        "headers": {
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
    },
    "人民日报": {
        "search_url": "http://search.people.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "st": "0",
            "startDate": "{start_date}",
            "endDate": "{end_date}",
            "page": "{page}"
        },
        "method": "get"
    },
    "央视网": {
        "search_url": "https://search.cctv.com/search.php",
        "search_params": {
            "qtext": "{keyword}",
            "type": "web",
            "page": "{page}"
        },
        "method": "get"
    },
    "光明日报": {
        "search_url": "http://search.gmw.cn/search",
        "search_params": {
            "q": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "经济日报": {
        "search_url": "http://www.ce.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国日报": {
        "search_url": "http://www.chinadaily.com.cn/search",
        "search_params": {
            "q": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "工人日报": {
        "search_url": "https://www.workercn.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "科技日报": {
        "search_url": "http://www.stdaily.com/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "人民政协网": {
        "search_url": "https://www.rmzxw.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国纪检监察报": {
        "search_url": "http://www.jjjcb.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国新闻社": {
        "search_url": "https://www.chinanews.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "学习时报": {
        "search_url": "https://www.studytimes.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国青年报": {
        "search_url": "http://news.cyol.com/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国妇女报": {
        "search_url": "https://www.cnwomen.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "法治日报": {
        "search_url": "http://www.legaldaily.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "农民日报": {
        "search_url": "https://www.farmer.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "学习强国": {
        "search_url": "https://www.xuexi.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "旗帜网": {
        "search_url": "http://www.qizhiwang.org.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国网": {
        "search_url": "http://www.china.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国政府网": {
        "search_url": "https://www.gov.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "求是网": {
        "search_url": "http://www.qstheory.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "解放军报": {
        "search_url": "http://www.81.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    }
}
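For clarity, this is roughly how one config entry expands into request parameters — a minimal sketch; the keyword, page, and dates are illustrative only:

# 示意:人民日报配置在第2页搜索"乡村振兴"时的参数展开(数值仅作演示)
config = WEBSITE_SEARCH_CONFIGS["人民日报"]
params = {
    key: value.format(keyword="乡村振兴", page=2,
                      start_date="2025-01-01", end_date="2025-09-24")
    for key, value in config["search_params"].items()
}
# 结果: {"keyword": "乡村振兴", "st": "0", "startDate": "2025-01-01",
#        "endDate": "2025-09-24", "page": "2"}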
def search_articles_by_keyword(website_name, keyword, max_pages=10, start_date=None, end_date=None):
    """
    根据关键词搜索文章

    Args:
        website_name: 网站名称
        keyword: 搜索关键词
        max_pages: 最大搜索页数
        start_date: 开始日期 (YYYY-MM-DD)
        end_date: 结束日期 (YYYY-MM-DD)

    Returns:
        list: 搜索到的文章URL列表
    """
    if website_name not in WEBSITE_SEARCH_CONFIGS:
        print(f"网站 {website_name} 不支持搜索功能")
        return []

    config = WEBSITE_SEARCH_CONFIGS[website_name]
    article_urls = []

    # 设置默认日期范围
    if not start_date:
        start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
    if not end_date:
        end_date = datetime.now().strftime("%Y-%m-%d")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    headers.update(config.get("headers", {}))

    for page in range(1, max_pages + 1):
        try:
            # 构建搜索参数;requests会对参数自动做URL编码,
            # 这里传原始关键词,避免先quote()再编码造成二次编码
            search_params = {}
            for key, value in config["search_params"].items():
                search_params[key] = value.format(
                    keyword=keyword,
                    page=page,
                    start_date=start_date,
                    end_date=end_date
                )
            print(f"搜索 {website_name} 第 {page} 页: {keyword}")
            if config["method"] == "post":
                response = requests.post(
                    config["search_url"],
                    data=search_params,
                    headers=headers,
                    timeout=15
                )
            else:
                response = requests.get(
                    config["search_url"],
                    params=search_params,
                    headers=headers,
                    timeout=15
                )
            response.raise_for_status()
            response.encoding = 'utf-8'

            # 解析搜索结果
            soup = BeautifulSoup(response.text, "html.parser")
            page_urls = extract_search_results(soup, website_name)
            if not page_urls:
                print(f"第 {page} 页没有找到更多结果")
                break
            article_urls.extend(page_urls)
            print(f"第 {page} 页找到 {len(page_urls)} 篇文章")

            # 避免请求过快
            time.sleep(1)
        except Exception as e:
            print(f"搜索第 {page} 页时出错: {e}")
            continue

    print(f"总共找到 {len(article_urls)} 篇文章")
    return article_urls
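A quick usage sketch for the search helper — keyword and dates are illustrative, and the call needs a configured Django environment so the downstream imports resolve:

# 示例:在人民日报搜索包含"乡村振兴"的文章链接(参数仅作演示)
urls = search_articles_by_keyword(
    "人民日报", "乡村振兴",
    max_pages=3, start_date="2025-01-01", end_date="2025-09-24"
)
print(f"共找到 {len(urls)} 个候选链接")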
def extract_search_results(soup, website_name):
    """
    从搜索结果页面提取文章链接

    Args:
        soup: BeautifulSoup对象
        website_name: 网站名称

    Returns:
        list: 文章URL列表
    """
    # 各网站搜索结果中文章链接的URL特征,任一子串命中即视为文章链接
    url_patterns = {
        "新华网": ("/news/", "/article/"),
        "人民日报": ("/n1/", "/article/"),
        "央视网": ("/news/", "ARTI"),
        "光明日报": ("/article/", "/content/"),
        "经济日报": ("/article/", "/content/"),
        "中国日报": ("/article/", "/content/"),
        "工人日报": ("/c/", "/article/"),
        "科技日报": ("/article/", "/content/"),
        "人民政协网": ("/article/", "/content/"),
        "中国纪检监察报": ("/article/", "/content/"),
        "中国新闻社": ("/article/", "/content/"),
        "学习时报": ("/article/", "/content/"),
        "中国青年报": ("/article/", "/content/"),
        "中国妇女报": ("/article/", "/content/"),
        "农民日报": ("/article/", "/content/"),
        "学习强国": ("/article/", "/content/"),
        "旗帜网": ("/n1/", "/article/"),
        "中国网": ("/article/", "/opinion/"),
        "中国政府网": ("/zhengce/", "/xinwen/"),
        "求是网": ("/article/", "/content/"),
        "解放军报": ("/zt/", "/article/"),
    }

    urls = []
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if website_name == "法治日报":
            # 法治日报要求路径同时包含 /content/ 和 content_
            if "/content/" in href and "content_" in href:
                urls.append(href)
        elif any(pattern in href for pattern in url_patterns.get(website_name, ())):
            urls.append(href)

    # 去重并返回
    return list(set(urls))
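To illustrate the extraction rules, a self-contained sketch against a tiny hand-written results page (the HTML is invented for the demo):

# 示例:从一小段手写的搜索结果HTML中提取新华网文章链接
from bs4 import BeautifulSoup

html = (
    '<a href="http://www.news.cn/news/20250924/demo.html">某篇报道</a>'
    '<a href="http://so.news.cn/help">帮助页</a>'
)
soup = BeautifulSoup(html, "html.parser")
print(extract_search_results(soup, "新华网"))
# 只有包含 /news/ 或 /article/ 的链接被保留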
def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None, end_date=None, max_articles=100):
    """
    根据关键词爬取多个网站的文章

    Args:
        keyword: 搜索关键词
        website_names: 网站名称列表,为None时爬取所有支持搜索的网站
        max_pages: 每个网站最大搜索页数
        start_date: 开始日期 (YYYY-MM-DD)
        end_date: 结束日期 (YYYY-MM-DD)
        max_articles: 每个网站最多处理的文章数量

    Returns:
        dict: 爬取结果统计
    """
    if website_names is None:
        website_names = list(WEBSITE_SEARCH_CONFIGS.keys())

    results = {
        "keyword": keyword,
        "total_articles": 0,
        "success_count": 0,
        "failed_count": 0,
        "website_results": {}
    }

    print(f"开始根据关键词 '{keyword}' 爬取文章...")
    print(f"目标网站: {', '.join(website_names)}")

    for website_name in website_names:
        print(f"\n开始爬取 {website_name}...")
        try:
            # 获取或创建网站对象(延迟导入,避免循环依赖)
            from core.models import Website
            website, created = Website.objects.get_or_create(
                name=website_name,
                defaults={
                    'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                    'enabled': True
                }
            )

            # 搜索文章URL
            article_urls = search_articles_by_keyword(
                website_name, keyword, max_pages, start_date, end_date
            )
            if not article_urls:
                print(f"{website_name} 没有找到相关文章")
                results["website_results"][website_name] = {
                    "found_urls": 0,
                    "processed": 0,
                    "success": 0,
                    "failed": 0
                }
                continue

            # 限制文章数量
            if len(article_urls) > max_articles:
                article_urls = article_urls[:max_articles]

            print(f"{website_name} 找到 {len(article_urls)} 篇文章,开始处理...")
            website_success = 0
            website_failed = 0
            for i, url in enumerate(article_urls, 1):
                try:
                    print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
                    process_article(url, website)
                    website_success += 1
                    results["success_count"] += 1
                    # 避免请求过快
                    time.sleep(0.5)
                except Exception as e:
                    print(f"处理文章失败: {url}, 错误: {e}")
                    website_failed += 1
                    results["failed_count"] += 1

            results["website_results"][website_name] = {
                "found_urls": len(article_urls),
                "processed": len(article_urls),
                "success": website_success,
                "failed": website_failed
            }
            print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")
        except Exception as e:
            print(f"爬取 {website_name} 时出错: {e}")
            results["website_results"][website_name] = {
                "found_urls": 0,
                "processed": 0,
                "success": 0,
                "failed": 1,
                "error": str(e)
            }
            results["failed_count"] += 1

    results["total_articles"] = results["success_count"] + results["failed_count"]
    print(f"\n爬取完成!")
    print(f"关键词: {keyword}")
    print(f"总文章数: {results['total_articles']}")
    print(f"成功: {results['success_count']}")
    print(f"失败: {results['failed_count']}")
    return results
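A usage sketch for the keyword crawl — the site list and limits are illustrative, and `process_article` (defined elsewhere in this module) must be importable for the calls to succeed:

# 示例:只在两个站点按关键词爬取,并限制每站文章数(参数仅作演示)
results = crawl_by_keyword(
    "乡村振兴",
    website_names=["新华网", "人民日报"],
    max_pages=5,
    max_articles=50
)
print(results["success_count"], results["failed_count"])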
def crawl_historical_articles(website_names=None, start_date=None, end_date=None, max_articles_per_site=50):
    """
    爬取历史文章

    Args:
        website_names: 网站名称列表
        start_date: 开始日期 (YYYY-MM-DD)
        end_date: 结束日期 (YYYY-MM-DD)
        max_articles_per_site: 每个网站最大文章数

    Returns:
        dict: 爬取结果统计
    """
    if not start_date:
        start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
    if not end_date:
        end_date = datetime.now().strftime("%Y-%m-%d")
    if website_names is None:
        website_names = list(WEBSITE_SEARCH_CONFIGS.keys())

    results = {
        "start_date": start_date,
        "end_date": end_date,
        "total_articles": 0,
        "success_count": 0,
        "failed_count": 0,
        "website_results": {}
    }

    print(f"开始爬取历史文章...")
    print(f"日期范围: {start_date} 至 {end_date}")
    print(f"目标网站: {', '.join(website_names)}")

    # 使用通用关键词搜索历史文章
    common_keywords = ["新闻", "报道", "文章", "资讯", "动态"]

    for website_name in website_names:
        print(f"\n开始爬取 {website_name} 历史文章...")
        try:
            from core.models import Website
            website, created = Website.objects.get_or_create(
                name=website_name,
                defaults={
                    'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                    'enabled': True
                }
            )

            website_success = 0
            website_failed = 0
            all_urls = set()

            # 使用多个关键词搜索
            for keyword in common_keywords:
                try:
                    article_urls = search_articles_by_keyword(
                        website_name, keyword, max_pages=5,
                        start_date=start_date, end_date=end_date
                    )
                    all_urls.update(article_urls)
                    if len(all_urls) >= max_articles_per_site:
                        break
                except Exception as e:
                    print(f"搜索关键词 '{keyword}' 时出错: {e}")
                    continue

            # 限制文章数量
            article_urls = list(all_urls)[:max_articles_per_site]
            if not article_urls:
                print(f"{website_name} 没有找到历史文章")
                results["website_results"][website_name] = {
                    "found_urls": 0,
                    "processed": 0,
                    "success": 0,
                    "failed": 0
                }
                continue

            print(f"{website_name} 找到 {len(article_urls)} 篇历史文章,开始处理...")
            for i, url in enumerate(article_urls, 1):
                try:
                    print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
                    process_article(url, website)
                    website_success += 1
                    results["success_count"] += 1
                    time.sleep(0.5)
                except Exception as e:
                    print(f"处理文章失败: {url}, 错误: {e}")
                    website_failed += 1
                    results["failed_count"] += 1

            results["website_results"][website_name] = {
                "found_urls": len(article_urls),
                "processed": len(article_urls),
                "success": website_success,
                "failed": website_failed
            }
            print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")
        except Exception as e:
            print(f"爬取 {website_name} 历史文章时出错: {e}")
            results["website_results"][website_name] = {
                "found_urls": 0,
                "processed": 0,
                "success": 0,
                "failed": 1,
                "error": str(e)
            }
            results["failed_count"] += 1

    results["total_articles"] = results["success_count"] + results["failed_count"]
    print(f"\n历史文章爬取完成!")
    print(f"日期范围: {start_date} 至 {end_date}")
    print(f"总文章数: {results['total_articles']}")
    print(f"成功: {results['success_count']}")
    print(f"失败: {results['failed_count']}")
    return results
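And a matching sketch for the historical crawl, under the same Django-context assumption — the one-week window and per-site cap are illustrative:

# 示例:回溯最近一周的历史文章,每站最多20篇(参数仅作演示)
results = crawl_historical_articles(
    website_names=["央视网"],
    start_date="2025-09-17",
    end_date="2025-09-24",
    max_articles_per_site=20
)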