Support keyword
This commit is contained in:
@@ -16,9 +16,10 @@ from django.utils import timezone
|
||||
from django.db.models import Count, Q
|
||||
from django.core.cache import cache
|
||||
|
||||
from .models import Website, Article
|
||||
from .models import Website, Article, CrawlTask
|
||||
from .tasks import crawl_website, crawl_all_websites, cleanup_old_articles
|
||||
from .distributed_crawler import distributed_crawler
|
||||
from .task_executor import task_executor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -411,6 +412,419 @@ class ArticleAdmin(admin.ModelAdmin):
|
||||
actions_column.short_description = '操作'
|
||||
|
||||
|
||||
class CrawlTaskStatusFilter(SimpleListFilter):
    """Admin sidebar filter that narrows crawl tasks by their ``status`` field."""

    title = '任务状态'
    parameter_name = 'status'

    # (stored value, human-readable label) pairs offered in the sidebar.
    _STATUS_CHOICES = (
        ('pending', '等待中'),
        ('running', '运行中'),
        ('completed', '已完成'),
        ('failed', '失败'),
        ('cancelled', '已取消'),
    )

    def lookups(self, request, model_admin):
        """Return the selectable (value, label) choices."""
        return self._STATUS_CHOICES

    def queryset(self, request, queryset):
        """Filter by the chosen status; with no selection, pass through unchanged."""
        selected = self.value()
        return queryset.filter(status=selected) if selected else queryset
|
||||
|
||||
|
||||
class CrawlTaskTypeFilter(SimpleListFilter):
    """Admin sidebar filter that narrows crawl tasks by their ``task_type`` field."""

    title = '任务类型'
    parameter_name = 'task_type'

    # (stored value, human-readable label) pairs offered in the sidebar.
    _TYPE_CHOICES = (
        ('keyword', '关键词搜索'),
        ('historical', '历史文章'),
        ('full_site', '全站爬取'),
    )

    def lookups(self, request, model_admin):
        """Return the selectable (value, label) choices."""
        return self._TYPE_CHOICES

    def queryset(self, request, queryset):
        """Filter by the chosen task type; with no selection, pass through unchanged."""
        selected = self.value()
        return queryset.filter(task_type=selected) if selected else queryset
|
||||
|
||||
|
||||
class CrawlTaskAdmin(admin.ModelAdmin):
    """Admin for CrawlTask: list/progress display, bulk start/cancel/delete
    actions, and custom views for creating tasks and inspecting results."""

    list_display = [
        'name', 'task_type', 'keyword', 'websites_display', 'status',
        'progress_display', 'created_at', 'duration_display', 'actions_column'
    ]
    list_filter = [CrawlTaskStatusFilter, CrawlTaskTypeFilter, 'created_at']
    search_fields = ['name', 'keyword', 'created_by']
    # Execution state and statistics are maintained by the task executor,
    # never edited by hand in the admin form.
    readonly_fields = [
        'status', 'progress', 'current_website', 'current_action',
        'total_articles', 'success_count', 'failed_count',
        'created_at', 'started_at', 'completed_at', 'error_message',
        'result_details', 'duration_display', 'progress_display'
    ]
    actions = ['start_tasks', 'cancel_tasks', 'delete_completed_tasks']

    class Media:
        # JS that backs the startTask()/cancelTask()/viewResults() handlers
        # rendered by actions_column below.
        js = ('admin/js/crawl_task_actions.js',)

    fieldsets = (
        ('基本信息', {
            'fields': ('name', 'task_type', 'keyword')
        }),
        ('爬取配置', {
            'fields': ('websites', 'start_date', 'end_date', 'max_pages', 'max_articles')
        }),
        ('任务状态', {
            'fields': ('status', 'progress_display', 'current_website', 'current_action'),
            'classes': ('collapse',)
        }),
        ('统计信息', {
            'fields': ('total_articles', 'success_count', 'failed_count'),
            'classes': ('collapse',)
        }),
        ('时间信息', {
            'fields': ('created_at', 'started_at', 'completed_at', 'duration_display'),
            'classes': ('collapse',)
        }),
        ('错误信息', {
            'fields': ('error_message',),
            'classes': ('collapse',)
        }),
        ('结果详情', {
            'fields': ('result_details',),
            'classes': ('collapse',)
        }),
    )

    def websites_display(self, obj):
        """List-column text for the task's target websites (delegates to the model)."""
        return obj.get_websites_display()
    websites_display.short_description = '目标网站'

    def progress_display(self, obj):
        """List-column widget: an inline progress bar while running,
        otherwise a colored status glyph."""
        if obj.status == 'running':
            # obj.progress is interpolated twice: once as the bar width (%),
            # once as the visible percentage label.
            return format_html(
                '<div style="width: 100px; background-color: #f0f0f0; border-radius: 3px;">'
                '<div style="width: {}%; background-color: #4CAF50; height: 20px; border-radius: 3px; text-align: center; color: white; line-height: 20px;">{}%</div>'
                '</div>',
                obj.progress, obj.progress
            )
        elif obj.status == 'completed':
            return format_html('<span style="color: green;">✓ 完成</span>')
        elif obj.status == 'failed':
            return format_html('<span style="color: red;">✗ 失败</span>')
        elif obj.status == 'cancelled':
            return format_html('<span style="color: orange;">⊘ 已取消</span>')
        else:
            return format_html('<span style="color: gray;">⏳ 等待</span>')
    progress_display.short_description = '进度'

    def duration_display(self, obj):
        """Human-readable task run time, or "-" when the model reports no
        duration (e.g. the task has not started)."""
        duration = obj.get_duration()
        if duration:
            total_seconds = int(duration.total_seconds())
            hours = total_seconds // 3600
            minutes = (total_seconds % 3600) // 60
            seconds = total_seconds % 60
            # Show at most two units, largest first.
            if hours > 0:
                return f"{hours}小时{minutes}分钟"
            elif minutes > 0:
                return f"{minutes}分钟{seconds}秒"
            else:
                return f"{seconds}秒"
        return "-"
    duration_display.short_description = '执行时长'

    def actions_column(self, obj):
        """Per-row action buttons whose availability depends on task state;
        the onclick handlers live in admin/js/crawl_task_actions.js."""
        actions = []

        if obj.status == 'pending':
            actions.append(f'<a href="javascript:void(0)" onclick="startTask({obj.id})" class="button">开始</a>')

        if obj.can_cancel():
            actions.append(f'<a href="javascript:void(0)" onclick="cancelTask({obj.id})" class="button">取消</a>')

        if obj.status == 'completed':
            actions.append(f'<a href="javascript:void(0)" onclick="viewResults({obj.id})" class="button">查看结果</a>')

        # NOTE(review): format_html() is called on a pre-joined string with no
        # placeholder args, so nothing is escaped here. Safe only because the
        # interpolated values are integer ids — confirm, and beware that any
        # literal '{'/'}' in the joined string would raise at render time.
        return format_html(' '.join(actions))
    actions_column.short_description = '操作'

    def start_tasks(self, request, queryset):
        """Bulk admin action: ask the task executor to start every selected
        task that is still pending; report per-task failures individually."""
        started_count = 0
        for task in queryset.filter(status='pending'):
            try:
                success, message = task_executor.start_task(task.id)
                if success:
                    started_count += 1
                else:
                    self.message_user(request, f'启动任务 {task.name} 失败: {message}', messages.ERROR)
            except Exception as e:
                self.message_user(request, f'启动任务 {task.name} 失败: {e}', messages.ERROR)

        if started_count > 0:
            self.message_user(request, f'成功启动 {started_count} 个任务', messages.SUCCESS)
    start_tasks.short_description = '启动选中的任务'

    def cancel_tasks(self, request, queryset):
        """Bulk admin action: cancel every selected task that is pending or
        running; report per-task failures individually."""
        cancelled_count = 0
        for task in queryset.filter(status__in=['pending', 'running']):
            try:
                success, message = task_executor.cancel_task(task.id)
                if success:
                    cancelled_count += 1
                else:
                    self.message_user(request, f'取消任务 {task.name} 失败: {message}', messages.ERROR)
            except Exception as e:
                self.message_user(request, f'取消任务 {task.name} 失败: {e}', messages.ERROR)

        if cancelled_count > 0:
            self.message_user(request, f'成功取消 {cancelled_count} 个任务', messages.SUCCESS)
    cancel_tasks.short_description = '取消选中的任务'

    def delete_completed_tasks(self, request, queryset):
        """Bulk admin action: delete selected tasks that have reached a
        terminal state (completed/failed/cancelled); running tasks are kept."""
        completed_tasks = queryset.filter(status__in=['completed', 'failed', 'cancelled'])
        count = completed_tasks.count()
        completed_tasks.delete()

        if count > 0:
            self.message_user(request, f'成功删除 {count} 个已完成的任务', messages.SUCCESS)
    delete_completed_tasks.short_description = '删除已完成的任务'

    def get_urls(self):
        """Register the custom task-creation and task-control views ahead of
        the default ModelAdmin URLs so they take precedence."""
        urls = super().get_urls()
        custom_urls = [
            path(
                'create-keyword-task/',
                self.admin_site.admin_view(self.create_keyword_task_view),
                name='create_keyword_task',
            ),
            path(
                'create-historical-task/',
                self.admin_site.admin_view(self.create_historical_task_view),
                name='create_historical_task',
            ),
            path(
                'create-full-site-task/',
                self.admin_site.admin_view(self.create_full_site_task_view),
                name='create_full_site_task',
            ),
            path(
                '<int:task_id>/start/',
                self.admin_site.admin_view(self.start_task_view),
                name='start_task',
            ),
            path(
                '<int:task_id>/cancel/',
                self.admin_site.admin_view(self.cancel_task_view),
                name='cancel_task',
            ),
            path(
                '<int:task_id>/results/',
                self.admin_site.admin_view(self.view_results_view),
                name='view_results',
            ),
        ]
        return custom_urls + urls

    def create_keyword_task_view(self, request):
        """Create a keyword-search CrawlTask.

        POST: validate name/keyword, create the task, attach selected
        websites, then redirect to the task's change page (or back to the
        changelist on validation failure). GET: render the creation form.
        """
        if request.method == 'POST':
            try:
                # NOTE(review): imported but apparently unused in this view —
                # confirm before removing.
                from .utils import WEBSITE_SEARCH_CONFIGS

                name = request.POST.get('name', '')
                keyword = request.POST.get('keyword', '')
                websites = request.POST.getlist('websites')
                start_date = request.POST.get('start_date')
                end_date = request.POST.get('end_date')
                max_pages = int(request.POST.get('max_pages', 10))
                max_articles = int(request.POST.get('max_articles', 100))

                if not name or not keyword:
                    self.message_user(request, '任务名称和关键词不能为空', messages.ERROR)
                    return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))

                # Create the task record; empty dates are stored as NULL.
                task = CrawlTask.objects.create(
                    name=name,
                    task_type='keyword',
                    keyword=keyword,
                    start_date=start_date if start_date else None,
                    end_date=end_date if end_date else None,
                    max_pages=max_pages,
                    max_articles=max_articles,
                    created_by=request.user.username if request.user.is_authenticated else 'admin'
                )

                # Attach the selected target websites (matched by name).
                if websites:
                    website_objects = Website.objects.filter(name__in=websites)
                    task.websites.set(website_objects)

                self.message_user(request, f'关键词搜索任务 "{name}" 创建成功', messages.SUCCESS)
                return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))

            except Exception as e:
                self.message_user(request, f'创建任务失败: {e}', messages.ERROR)

        # GET (or failed POST fall-through): show the creation form.
        context = {
            'websites': Website.objects.filter(enabled=True),
            'title': '创建关键词搜索任务'
        }
        # NOTE(review): wrapping a plain render helper in admin_view() again is
        # unusual (get_urls already applies admin_view) — confirm intent.
        return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_keyword_task.html', context)

    def create_historical_task_view(self, request):
        """Create a historical-articles CrawlTask.

        Same POST/GET flow as create_keyword_task_view, but only the name is
        required and the keyword is a fixed placeholder.
        """
        if request.method == 'POST':
            try:
                # NOTE(review): imported but apparently unused in this view —
                # confirm before removing.
                from .utils import WEBSITE_SEARCH_CONFIGS

                name = request.POST.get('name', '')
                websites = request.POST.getlist('websites')
                start_date = request.POST.get('start_date')
                end_date = request.POST.get('end_date')
                max_articles = int(request.POST.get('max_articles', 50))

                if not name:
                    self.message_user(request, '任务名称不能为空', messages.ERROR)
                    return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))

                # Create the task record; keyword is a fixed label for this type.
                task = CrawlTask.objects.create(
                    name=name,
                    task_type='historical',
                    keyword='历史文章',
                    start_date=start_date if start_date else None,
                    end_date=end_date if end_date else None,
                    max_articles=max_articles,
                    created_by=request.user.username if request.user.is_authenticated else 'admin'
                )

                # Attach the selected target websites (matched by name).
                if websites:
                    website_objects = Website.objects.filter(name__in=websites)
                    task.websites.set(website_objects)

                self.message_user(request, f'历史文章任务 "{name}" 创建成功', messages.SUCCESS)
                return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))

            except Exception as e:
                self.message_user(request, f'创建任务失败: {e}', messages.ERROR)

        # GET (or failed POST fall-through): show the creation form.
        context = {
            'websites': Website.objects.filter(enabled=True),
            'title': '创建历史文章任务'
        }
        return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_historical_task.html', context)

    def create_full_site_task_view(self, request):
        """Create a full-site CrawlTask.

        Same POST/GET flow as create_keyword_task_view, but only the name and
        an optional page cap are taken from the form.
        """
        if request.method == 'POST':
            try:
                # NOTE(review): imported but apparently unused in this view —
                # confirm before removing.
                from .utils import WEBSITE_SEARCH_CONFIGS

                name = request.POST.get('name', '')
                websites = request.POST.getlist('websites')
                max_pages = int(request.POST.get('max_pages', 500))

                if not name:
                    self.message_user(request, '任务名称不能为空', messages.ERROR)
                    return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))

                # Create the task record; keyword is a fixed label for this type.
                task = CrawlTask.objects.create(
                    name=name,
                    task_type='full_site',
                    keyword='全站爬取',
                    max_pages=max_pages,
                    created_by=request.user.username if request.user.is_authenticated else 'admin'
                )

                # Attach the selected target websites (matched by name).
                if websites:
                    website_objects = Website.objects.filter(name__in=websites)
                    task.websites.set(website_objects)

                self.message_user(request, f'全站爬取任务 "{name}" 创建成功', messages.SUCCESS)
                return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))

            except Exception as e:
                self.message_user(request, f'创建任务失败: {e}', messages.ERROR)

        # GET (or failed POST fall-through): show the creation form.
        context = {
            'websites': Website.objects.filter(enabled=True),
            'title': '创建全站爬取任务'
        }
        return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_full_site_task.html', context)

    def start_task_view(self, request, task_id):
        """Start a single task via the executor, report the outcome as an
        admin message, and return to the changelist."""
        try:
            success, message = task_executor.start_task(task_id)
            if success:
                self.message_user(request, f'任务已启动: {message}', messages.SUCCESS)
            else:
                self.message_user(request, f'启动任务失败: {message}', messages.ERROR)
        except Exception as e:
            self.message_user(request, f'启动任务失败: {e}', messages.ERROR)

        return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))

    def cancel_task_view(self, request, task_id):
        """Cancel a single task via the executor, report the outcome as an
        admin message, and return to the changelist."""
        try:
            success, message = task_executor.cancel_task(task_id)
            if success:
                self.message_user(request, f'任务已取消: {message}', messages.SUCCESS)
            else:
                self.message_user(request, f'取消任务失败: {message}', messages.ERROR)
        except Exception as e:
            self.message_user(request, f'取消任务失败: {e}', messages.ERROR)

        return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))

    def view_results_view(self, request, task_id):
        """Render the results page for one task; on an unknown id, warn and
        return to the changelist."""
        try:
            task = CrawlTask.objects.get(id=task_id)
            context = {
                'task': task,
                'title': f'任务结果 - {task.name}'
            }
            return admin.site.admin_view(self.render_create_task_template)(request, 'admin/task_results.html', context)
        except CrawlTask.DoesNotExist:
            self.message_user(request, '任务不存在', messages.ERROR)
            return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))

    def render_create_task_template(self, request, template_name, context):
        """Render *template_name* with admin-site chrome variables merged into
        *context* and return it as a plain HttpResponse."""
        from django.template.loader import render_to_string
        from django.http import HttpResponse

        # Minimal set of variables the admin base templates expect.
        context.update({
            'site_header': admin.site.site_header,
            'site_title': admin.site.site_title,
            'has_permission': True,
            'user': request.user,
        })

        html = render_to_string(template_name, context)
        return HttpResponse(html)
|
||||
|
||||
|
||||
#class CrawlerStatusAdmin(admin.ModelAdmin):
|
||||
# """爬虫状态管理"""
|
||||
# change_list_template = 'admin/crawler_status.html'
|
||||
@@ -448,6 +862,7 @@ class ArticleAdmin(admin.ModelAdmin):
|
||||
# Register the model admins with the default admin site.
admin.site.register(Website, WebsiteAdmin)
admin.site.register(Article, ArticleAdmin)
admin.site.register(CrawlTask, CrawlTaskAdmin)


# Hide the Celery Results admin section.
|
||||
|
||||
257
core/management/commands/crawl_all_websites.py
Normal file
257
core/management/commands/crawl_all_websites.py
Normal file
@@ -0,0 +1,257 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.utils import full_site_crawler, crawl_by_keyword, WEBSITE_SEARCH_CONFIGS
|
||||
from core.models import Website
|
||||
import json
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command: crawl every supported website in one shot, in
    full-site mode, keyword mode, or both, and optionally dump a JSON report."""

    help = "一键爬取所有支持的网站"

    def add_arguments(self, parser):
        parser.add_argument(
            '--mode', '-m',
            type=str,
            choices=['full', 'keyword', 'both'],
            default='full',
            help='爬取模式: full(全站爬取), keyword(关键词爬取), both(两种模式)'
        )

        parser.add_argument(
            '--keyword', '-k',
            type=str,
            help='关键词搜索模式下的搜索关键词'
        )

        parser.add_argument(
            '--websites', '-w',
            type=str,
            nargs='*',
            help='指定要爬取的网站名称列表,如果不指定则爬取所有支持的网站'
        )

        parser.add_argument(
            '--max-pages', '-p',
            type=int,
            default=500,
            help='全站爬取最大页数 (默认: 500)'
        )

        parser.add_argument(
            '--max-search-pages', '-sp',
            type=int,
            default=10,
            help='关键词搜索最大页数 (默认: 10)'
        )

        parser.add_argument(
            '--max-articles', '-a',
            type=int,
            default=100,
            help='关键词搜索最大文章数量 (默认: 100)'
        )

        parser.add_argument(
            '--start-date', '-s',
            type=str,
            help='开始日期 (格式: YYYY-MM-DD)'
        )

        parser.add_argument(
            '--end-date', '-e',
            type=str,
            help='结束日期 (格式: YYYY-MM-DD)'
        )

        parser.add_argument(
            '--list-websites', '-l',
            action='store_true',
            help='列出所有支持的网站'
        )

        parser.add_argument(
            '--output', '-o',
            type=str,
            help='将结果保存到JSON文件'
        )

        parser.add_argument(
            '--skip-existing',
            action='store_true',
            help='跳过已存在的网站配置'
        )

    def handle(self, *args, **options):
        """Validate options, iterate the target websites running the requested
        crawl mode(s), print a summary, and optionally save results to JSON."""
        # --list-websites: print the supported site names and exit.
        if options['list_websites']:
            self.stdout.write(self.style.SUCCESS("支持的网站列表:"))
            for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
                self.stdout.write(f"{i:2d}. {website}")
            return

        mode = options['mode']
        keyword = options['keyword']
        websites = options['websites']
        max_pages = options['max_pages']
        max_search_pages = options['max_search_pages']
        max_articles = options['max_articles']
        start_date = options['start_date']
        end_date = options['end_date']
        output_file = options['output']
        skip_existing = options['skip_existing']

        # Reject any website name missing from the supported-site registry.
        if websites:
            invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
            if invalid_websites:
                self.stdout.write(
                    self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}")
                )
                self.stdout.write("使用 --list-websites 查看支持的网站列表")
                return

        # Default to every supported website when none were specified.
        target_websites = websites if websites else list(WEBSITE_SEARCH_CONFIGS.keys())

        # Keyword-based modes require a keyword.
        if mode in ['keyword', 'both'] and not keyword:
            self.stdout.write(
                self.style.ERROR("关键词模式需要指定 --keyword 参数")
            )
            return

        self.stdout.write(f"开始一键爬取任务...")
        self.stdout.write(f"爬取模式: {mode}")
        self.stdout.write(f"目标网站: {', '.join(target_websites)}")
        if keyword:
            self.stdout.write(f"关键词: {keyword}")
        if start_date:
            self.stdout.write(f"开始日期: {start_date}")
        if end_date:
            self.stdout.write(f"结束日期: {end_date}")

        # Accumulated report: per-site results for each mode plus counters.
        all_results = {
            "mode": mode,
            "websites": target_websites,
            "keyword": keyword,
            "start_date": start_date,
            "end_date": end_date,
            "full_crawl_results": {},
            "keyword_crawl_results": {},
            "summary": {
                "total_websites": len(target_websites),
                "full_crawl_success": 0,
                "full_crawl_failed": 0,
                "keyword_crawl_success": 0,
                "keyword_crawl_failed": 0
            }
        }

        try:
            for website_name in target_websites:
                self.stdout.write(f"\n{'='*50}")
                self.stdout.write(f"开始处理网站: {website_name}")
                self.stdout.write(f"{'='*50}")

                # Ensure a Website row exists; new rows are seeded from the
                # registry's search_url and enabled by default.
                website, created = Website.objects.get_or_create(
                    name=website_name,
                    defaults={
                        'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                        'enabled': True
                    }
                )

                # --skip-existing: only process websites created just now.
                if not created and skip_existing:
                    self.stdout.write(f"跳过已存在的网站: {website_name}")
                    continue

                website_results = {
                    "full_crawl": None,
                    "keyword_crawl": None
                }

                # Full-site crawl; a failure is recorded and does not stop
                # the remaining websites.
                if mode in ['full', 'both']:
                    self.stdout.write(f"\n开始全站爬取: {website_name}")
                    try:
                        full_site_crawler(
                            WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                            website,
                            max_pages=max_pages
                        )
                        self.stdout.write(self.style.SUCCESS(f"全站爬取完成: {website_name}"))
                        website_results["full_crawl"] = {"status": "success"}
                        all_results["summary"]["full_crawl_success"] += 1
                    except Exception as e:
                        self.stdout.write(self.style.ERROR(f"全站爬取失败: {website_name}, 错误: {e}"))
                        website_results["full_crawl"] = {"status": "failed", "error": str(e)}
                        all_results["summary"]["full_crawl_failed"] += 1

                # Keyword crawl; a site counts as a success only if at least
                # one article was fetched successfully.
                if mode in ['keyword', 'both']:
                    self.stdout.write(f"\n开始关键词爬取: {website_name}")
                    try:
                        keyword_results = crawl_by_keyword(
                            keyword=keyword,
                            website_names=[website_name],
                            max_pages=max_search_pages,
                            start_date=start_date,
                            end_date=end_date,
                            max_articles=max_articles
                        )
                        website_results["keyword_crawl"] = keyword_results
                        if keyword_results["success_count"] > 0:
                            all_results["summary"]["keyword_crawl_success"] += 1
                        else:
                            all_results["summary"]["keyword_crawl_failed"] += 1
                    except Exception as e:
                        self.stdout.write(self.style.ERROR(f"关键词爬取失败: {website_name}, 错误: {e}"))
                        website_results["keyword_crawl"] = {"status": "failed", "error": str(e)}
                        all_results["summary"]["keyword_crawl_failed"] += 1

                all_results["full_crawl_results"][website_name] = website_results["full_crawl"]
                all_results["keyword_crawl_results"][website_name] = website_results["keyword_crawl"]

            # Final summary banner and counters.
            self.stdout.write(f"\n{'='*50}")
            self.stdout.write(self.style.SUCCESS("一键爬取完成!"))
            self.stdout.write(f"{'='*50}")
            self.stdout.write(f"总网站数: {all_results['summary']['total_websites']}")

            if mode in ['full', 'both']:
                self.stdout.write(f"全站爬取 - 成功: {all_results['summary']['full_crawl_success']}, "
                                  f"失败: {all_results['summary']['full_crawl_failed']}")

            if mode in ['keyword', 'both']:
                self.stdout.write(f"关键词爬取 - 成功: {all_results['summary']['keyword_crawl_success']}, "
                                  f"失败: {all_results['summary']['keyword_crawl_failed']}")

            # Per-website breakdown.
            self.stdout.write("\n各网站详细结果:")
            for website_name in target_websites:
                self.stdout.write(f"\n{website_name}:")

                if mode in ['full', 'both']:
                    full_result = all_results["full_crawl_results"][website_name]
                    if full_result and full_result.get("status") == "success":
                        self.stdout.write(self.style.SUCCESS("  全站爬取: 成功"))
                    elif full_result:
                        self.stdout.write(self.style.ERROR(f"  全站爬取: 失败 - {full_result.get('error', '未知错误')}"))

                if mode in ['keyword', 'both']:
                    keyword_result = all_results["keyword_crawl_results"][website_name]
                    if keyword_result and "success_count" in keyword_result:
                        self.stdout.write(f"  关键词爬取: 成功 {keyword_result['success_count']} 篇, "
                                          f"失败 {keyword_result['failed_count']} 篇")
                    elif keyword_result and keyword_result.get("status") == "failed":
                        self.stdout.write(self.style.ERROR(f"  关键词爬取: 失败 - {keyword_result.get('error', '未知错误')}"))

            # Optional JSON report.
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(all_results, f, ensure_ascii=False, indent=2)
                self.stdout.write(f"\n结果已保存到: {output_file}")

        except Exception as e:
            # Unexpected failure outside the per-site handlers: report and
            # re-raise so the command exits non-zero.
            self.stdout.write(self.style.ERROR(f"一键爬取过程中出现错误: {e}"))
            raise
|
||||
157
core/management/commands/crawl_by_keyword.py
Normal file
157
core/management/commands/crawl_by_keyword.py
Normal file
@@ -0,0 +1,157 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.utils import crawl_by_keyword, crawl_historical_articles, WEBSITE_SEARCH_CONFIGS
|
||||
import json
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command: crawl articles from one or more supported websites
    by keyword (or in historical mode) and optionally dump a JSON report."""

    help = "根据关键词爬取多个网站的文章"

    def add_arguments(self, parser):
        parser.add_argument(
            '--keyword', '-k',
            type=str,
            help='搜索关键词'
        )

        parser.add_argument(
            '--websites', '-w',
            type=str,
            nargs='*',
            help='指定要爬取的网站名称列表,如果不指定则爬取所有支持的网站'
        )

        parser.add_argument(
            '--max-pages', '-p',
            type=int,
            default=10,
            help='每个网站最大搜索页数 (默认: 10)'
        )

        parser.add_argument(
            '--max-articles', '-a',
            type=int,
            default=100,
            help='最大文章数量 (默认: 100)'
        )

        parser.add_argument(
            '--start-date', '-s',
            type=str,
            help='开始日期 (格式: YYYY-MM-DD)'
        )

        parser.add_argument(
            '--end-date', '-e',
            type=str,
            help='结束日期 (格式: YYYY-MM-DD)'
        )

        parser.add_argument(
            '--historical',
            action='store_true',
            help='爬取历史文章模式'
        )

        parser.add_argument(
            '--list-websites', '-l',
            action='store_true',
            help='列出所有支持的网站'
        )

        parser.add_argument(
            '--output', '-o',
            type=str,
            help='将结果保存到JSON文件'
        )

    def handle(self, *args, **options):
        """Validate options, run the keyword (or historical) crawl, print a
        per-site summary, and optionally save results to JSON."""
        # --list-websites: print the supported site names and exit.
        if options['list_websites']:
            self.stdout.write(self.style.SUCCESS("支持的网站列表:"))
            for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
                self.stdout.write(f"{i:2d}. {website}")
            return

        # --keyword is mandatory even in historical mode (see below — it is
        # not forwarded to crawl_historical_articles, only used for logging).
        keyword = options['keyword']
        if not keyword:
            self.stdout.write(self.style.ERROR("必须指定 --keyword 参数"))
            return
        websites = options['websites']
        max_pages = options['max_pages']
        max_articles = options['max_articles']
        start_date = options['start_date']
        end_date = options['end_date']
        historical = options['historical']
        output_file = options['output']

        # Reject any website name missing from the supported-site registry.
        if websites:
            invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
            if invalid_websites:
                self.stdout.write(
                    self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}")
                )
                self.stdout.write("使用 --list-websites 查看支持的网站列表")
                return

        self.stdout.write(f"开始爬取任务...")
        self.stdout.write(f"关键词: {keyword}")
        if websites:
            self.stdout.write(f"目标网站: {', '.join(websites)}")
        else:
            self.stdout.write(f"目标网站: 所有支持的网站 ({len(WEBSITE_SEARCH_CONFIGS)}个)")

        if start_date:
            self.stdout.write(f"开始日期: {start_date}")
        if end_date:
            self.stdout.write(f"结束日期: {end_date}")
        self.stdout.write(f"最大页数: {max_pages}")
        self.stdout.write(f"最大文章数: {max_articles}")

        try:
            if historical:
                # Historical mode: date-range driven, keyword is ignored here.
                self.stdout.write(self.style.WARNING("使用历史文章爬取模式"))
                results = crawl_historical_articles(
                    website_names=websites,
                    start_date=start_date,
                    end_date=end_date,
                    max_articles_per_site=max_articles
                )
            else:
                # Keyword-search mode.
                results = crawl_by_keyword(
                    keyword=keyword,
                    website_names=websites,
                    max_pages=max_pages,
                    start_date=start_date,
                    end_date=end_date,
                    max_articles=max_articles
                )

            # Overall totals.
            self.stdout.write(self.style.SUCCESS("\n爬取完成!"))
            self.stdout.write(f"总文章数: {results['total_articles']}")
            self.stdout.write(f"成功: {results['success_count']}")
            self.stdout.write(f"失败: {results['failed_count']}")

            # Per-website breakdown; sites with zero successes are shown as
            # warnings, and any recorded error is printed beneath the line.
            self.stdout.write("\n各网站结果:")
            for website, result in results['website_results'].items():
                status = self.style.SUCCESS if result['success'] > 0 else self.style.WARNING
                self.stdout.write(
                    status(f"  {website}: 找到 {result['found_urls']} 篇, "
                           f"成功 {result['success']}, 失败 {result['failed']}")
                )
                if 'error' in result:
                    self.stdout.write(self.style.ERROR(f"    错误: {result['error']}"))

            # Optional JSON report.
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                self.stdout.write(f"\n结果已保存到: {output_file}")

        except Exception as e:
            # Report and re-raise so the command exits non-zero.
            self.stdout.write(self.style.ERROR(f"爬取过程中出现错误: {e}"))
            raise
|
||||
45
core/migrations/0002_crawltask.py
Normal file
45
core/migrations/0002_crawltask.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# Generated by Django 5.1 on 2025-09-23 19:28
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Create the CrawlTask model (auto-generated by Django; avoid editing
    by hand — the websites JSONField is replaced by a M2M in a follow-up
    migration)."""

    dependencies = [
        ('core', '0001_initial'),
    ]

    operations = [
        migrations.CreateModel(
            name='CrawlTask',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=200, verbose_name='任务名称')),
                ('task_type', models.CharField(choices=[('keyword', '关键词搜索'), ('historical', '历史文章'), ('full_site', '全站爬取')], default='keyword', max_length=20, verbose_name='任务类型')),
                ('keyword', models.CharField(blank=True, max_length=200, null=True, verbose_name='搜索关键词')),
                ('websites', models.JSONField(default=list, verbose_name='目标网站')),
                ('start_date', models.DateField(blank=True, null=True, verbose_name='开始日期')),
                ('end_date', models.DateField(blank=True, null=True, verbose_name='结束日期')),
                ('max_pages', models.IntegerField(default=10, verbose_name='最大页数')),
                ('max_articles', models.IntegerField(default=100, verbose_name='最大文章数')),
                ('status', models.CharField(choices=[('pending', '等待中'), ('running', '运行中'), ('completed', '已完成'), ('failed', '失败'), ('cancelled', '已取消')], default='pending', max_length=20, verbose_name='状态')),
                ('progress', models.IntegerField(default=0, verbose_name='进度百分比')),
                ('current_website', models.CharField(blank=True, max_length=100, null=True, verbose_name='当前网站')),
                ('current_action', models.CharField(blank=True, max_length=200, null=True, verbose_name='当前操作')),
                ('total_articles', models.IntegerField(default=0, verbose_name='总文章数')),
                ('success_count', models.IntegerField(default=0, verbose_name='成功数')),
                ('failed_count', models.IntegerField(default=0, verbose_name='失败数')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('started_at', models.DateTimeField(blank=True, null=True, verbose_name='开始时间')),
                ('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='完成时间')),
                ('error_message', models.TextField(blank=True, null=True, verbose_name='错误信息')),
                ('result_details', models.JSONField(blank=True, default=dict, verbose_name='结果详情')),
                ('created_by', models.CharField(blank=True, max_length=100, null=True, verbose_name='创建者')),
            ],
            options={
                'verbose_name': '爬取任务',
                'verbose_name_plural': '爬取任务',
                'ordering': ['-created_at'],
            },
        ),
    ]
|
||||
@@ -0,0 +1,22 @@
|
||||
# Generated by Django 5.1 on 2025-09-23 19:34
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Replace ``CrawlTask.websites`` (a JSONField list of names) with a
    ManyToManyField to ``Website``.

    NOTE(review): RemoveField + AddField discards any website names stored
    in the old JSON column — existing CrawlTask rows lose their website
    selection when this migration runs. Acceptable only if no production
    data needs to be preserved; otherwise a data migration should copy the
    old values first.
    """

    dependencies = [
        ('core', '0002_crawltask'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='crawltask',
            name='websites',
        ),
        migrations.AddField(
            model_name='crawltask',
            name='websites',
            field=models.ManyToManyField(blank=True, to='core.website', verbose_name='目标网站'),
        ),
    ]
|
||||
@@ -1,4 +1,6 @@
|
||||
from django.db import models
|
||||
from django.utils import timezone
|
||||
import json
|
||||
|
||||
|
||||
class Website(models.Model):
|
||||
@@ -25,3 +27,93 @@ class Article(models.Model):
|
||||
|
||||
def __str__(self):
|
||||
return self.title
|
||||
|
||||
|
||||
class CrawlTask(models.Model):
    """A crawl job (keyword search, historical backfill, or full-site crawl)
    with live progress fields and aggregate result counters.

    Rows are created from the admin, then executed by ``TaskExecutor``,
    which updates ``status``/``progress`` as the crawl proceeds.
    """

    TASK_STATUS_CHOICES = [
        ('pending', '等待中'),
        ('running', '运行中'),
        ('completed', '已完成'),
        ('failed', '失败'),
        ('cancelled', '已取消'),
    ]

    TASK_TYPE_CHOICES = [
        ('keyword', '关键词搜索'),
        ('historical', '历史文章'),
        ('full_site', '全站爬取'),
    ]

    # --- configuration (set at creation time) ---
    name = models.CharField(max_length=200, verbose_name="任务名称")
    task_type = models.CharField(max_length=20, choices=TASK_TYPE_CHOICES, default='keyword', verbose_name="任务类型")
    keyword = models.CharField(max_length=200, blank=True, null=True, verbose_name="搜索关键词")
    # An empty selection means "crawl every supported website"
    # (see get_websites_display and the executor's fallback).
    websites = models.ManyToManyField(Website, blank=True, verbose_name="目标网站")
    start_date = models.DateField(blank=True, null=True, verbose_name="开始日期")
    end_date = models.DateField(blank=True, null=True, verbose_name="结束日期")
    max_pages = models.IntegerField(default=10, verbose_name="最大页数")
    max_articles = models.IntegerField(default=100, verbose_name="最大文章数")

    # --- live progress (updated by the executor while running) ---
    status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name="状态")
    progress = models.IntegerField(default=0, verbose_name="进度百分比")
    current_website = models.CharField(max_length=100, blank=True, null=True, verbose_name="当前网站")
    current_action = models.CharField(max_length=200, blank=True, null=True, verbose_name="当前操作")

    # --- result counters ---
    total_articles = models.IntegerField(default=0, verbose_name="总文章数")
    success_count = models.IntegerField(default=0, verbose_name="成功数")
    failed_count = models.IntegerField(default=0, verbose_name="失败数")

    # --- timestamps ---
    created_at = models.DateTimeField(auto_now_add=True, verbose_name="创建时间")
    started_at = models.DateTimeField(blank=True, null=True, verbose_name="开始时间")
    completed_at = models.DateTimeField(blank=True, null=True, verbose_name="完成时间")

    error_message = models.TextField(blank=True, null=True, verbose_name="错误信息")
    result_details = models.JSONField(default=dict, blank=True, verbose_name="结果详情")

    created_by = models.CharField(max_length=100, blank=True, null=True, verbose_name="创建者")

    class Meta:
        verbose_name = "爬取任务"
        verbose_name_plural = "爬取任务"
        ordering = ['-created_at']

    def __str__(self):
        return f"{self.name} ({self.get_status_display()})"

    def get_websites_display(self):
        """Return a comma-separated list of target site names, or
        "所有网站" when nothing is selected (meaning: crawl all sites)."""
        websites = self.websites.all()
        if not websites:
            return "所有网站"
        return ", ".join(w.name for w in websites)

    def get_duration(self):
        """Return the elapsed run time as a timedelta, or ``None`` when the
        task has not started. Running tasks are measured up to now."""
        if not self.started_at:
            return None
        end_time = self.completed_at or timezone.now()
        return end_time - self.started_at

    def is_running(self):
        """True while the task is currently executing."""
        return self.status == 'running'

    def can_cancel(self):
        """True if the task may still be cancelled (not yet finished)."""
        return self.status in ('pending', 'running')

    def get_progress_display(self):
        """Human-readable one-line progress summary for the current status."""
        if self.status == 'pending':
            return "等待开始"
        if self.status == 'running':
            if self.current_website and self.current_action:
                return f"正在处理 {self.current_website}: {self.current_action}"
            return f"运行中 ({self.progress}%)"
        if self.status == 'completed':
            return f"已完成 ({self.success_count}/{self.total_articles})"
        if self.status == 'failed':
            if not self.error_message:
                return "失败"
            # Only append an ellipsis when the message was actually truncated
            # (the original added "..." unconditionally).
            suffix = "..." if len(self.error_message) > 50 else ""
            return f"失败: {self.error_message[:50]}{suffix}"
        if self.status == 'cancelled':
            return "已取消"
        return "未知状态"
|
||||
84
core/static/admin/js/crawl_task_actions.js
Normal file
84
core/static/admin/js/crawl_task_actions.js
Normal file
@@ -0,0 +1,84 @@
|
||||
/**
 * Crawl-task admin actions: start/cancel a task via POST, open the
 * results page, and auto-refresh while any task on the page is running.
 *
 * startTask/cancelTask/viewResults are called from inline onclick
 * handlers in the admin change list, so their names must not change.
 */

// Shared POST helper for the start/cancel admin endpoints. Asks for
// confirmation, reloads on success, and alerts errMsg on any failure.
// (The original duplicated this whole fetch block in both callers.)
function postTaskAction(taskId, action, confirmMsg, errMsg) {
    if (!confirm(confirmMsg)) {
        return;
    }
    fetch(`/admin/core/crawltask/${taskId}/${action}/`, {
        method: 'POST',
        headers: {
            'X-CSRFToken': getCookie('csrftoken'),
            'Content-Type': 'application/x-www-form-urlencoded',
        },
    })
    .then(response => {
        if (response.ok) {
            location.reload();
        } else {
            alert(errMsg);
        }
    })
    .catch(error => {
        console.error('Error:', error);
        alert(errMsg);
    });
}

function startTask(taskId) {
    postTaskAction(taskId, 'start', '确定要启动这个任务吗?', '启动任务失败');
}

function cancelTask(taskId) {
    postTaskAction(taskId, 'cancel', '确定要取消这个任务吗?', '取消任务失败');
}

function viewResults(taskId) {
    window.open(`/admin/core/crawltask/${taskId}/results/`, '_blank');
}

// Standard Django CSRF cookie lookup (per the Django docs recipe).
function getCookie(name) {
    let cookieValue = null;
    if (document.cookie && document.cookie !== '') {
        const cookies = document.cookie.split(';');
        for (let i = 0; i < cookies.length; i++) {
            const cookie = cookies[i].trim();
            if (cookie.substring(0, name.length + 1) === (name + '=')) {
                cookieValue = decodeURIComponent(cookie.substring(name.length + 1));
                break;
            }
        }
    }
    return cookieValue;
}

// Reload every 30 seconds while any row is marked data-task-status="running",
// so the progress column stays fresh without manual refreshing.
function autoRefreshRunningTasks() {
    const runningTasks = document.querySelectorAll('[data-task-status="running"]');
    if (runningTasks.length > 0) {
        setTimeout(() => {
            location.reload();
        }, 30000);
    }
}

document.addEventListener('DOMContentLoaded', function() {
    autoRefreshRunningTasks();
});
|
||||
235
core/task_executor.py
Normal file
235
core/task_executor.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""
|
||||
爬取任务执行器
|
||||
负责执行爬取任务并更新任务状态
|
||||
"""
|
||||
|
||||
import logging
import threading
import time

from django.db import transaction
from django.utils import timezone

from core.models import CrawlTask
from core.utils import crawl_by_keyword, crawl_historical_articles, full_site_crawler, WEBSITE_SEARCH_CONFIGS
|
||||
|
||||
|
||||
class TaskExecutor:
    """Runs CrawlTask jobs on background daemon threads and keeps each
    task row's status/progress fields up to date.

    Notes:
    - ``cancel_task`` only flips the DB status; the worker thread is not
      interrupted and will keep crawling until its current step finishes.
      The completion guards below keep it from overwriting the cancelled
      status afterwards.
    - ``running_tasks`` only tracks tasks started by *this* process.
    """

    def __init__(self):
        # task_id -> worker Thread, for tasks this process is executing.
        self.running_tasks = {}
        self.lock = threading.Lock()

    def start_task(self, task_id):
        """Start a pending task on a background daemon thread.

        Returns a ``(ok, message)`` tuple.
        """
        with self.lock:
            if task_id in self.running_tasks:
                return False, "任务已在运行中"

            try:
                task = CrawlTask.objects.get(id=task_id)
                if task.status != 'pending':
                    return False, "任务状态不允许启动"

                task.status = 'running'
                task.started_at = timezone.now()
                task.save()

                thread = threading.Thread(target=self._execute_task, args=(task_id,))
                thread.daemon = True
                thread.start()

                self.running_tasks[task_id] = thread
                return True, "任务已启动"

            except CrawlTask.DoesNotExist:
                return False, "任务不存在"
            except Exception as e:
                return False, f"启动任务失败: {e}"

    def cancel_task(self, task_id):
        """Mark a locally-running task as cancelled.

        Returns a ``(ok, message)`` tuple. The worker thread itself is not
        stopped — see the class docstring.
        """
        with self.lock:
            if task_id not in self.running_tasks:
                return False, "任务未在运行中"
            try:
                task = CrawlTask.objects.get(id=task_id)
                task.status = 'cancelled'
                task.completed_at = timezone.now()
                task.save()

                del self.running_tasks[task_id]
                return True, "任务已取消"
            except CrawlTask.DoesNotExist:
                return False, "任务不存在"

    def _execute_task(self, task_id):
        """Worker-thread entry point: dispatch on task_type, then record
        completion or failure on the task row."""
        try:
            task = CrawlTask.objects.get(id=task_id)

            # Dispatch table instead of an if/elif chain.
            handlers = {
                'keyword': self._execute_keyword_task,
                'historical': self._execute_historical_task,
                'full_site': self._execute_full_site_task,
            }
            handler = handlers.get(task.task_type)
            if handler is None:
                raise ValueError(f"不支持的任务类型: {task.task_type}")
            handler(task)

            with transaction.atomic():
                task = CrawlTask.objects.select_for_update().get(id=task_id)
                # Don't resurrect a task that was cancelled while we crawled.
                if task.status != 'cancelled':
                    task.status = 'completed'
                    task.completed_at = timezone.now()
                    task.progress = 100
                    task.save()

        except Exception as e:
            try:
                with transaction.atomic():
                    task = CrawlTask.objects.select_for_update().get(id=task_id)
                    if task.status != 'cancelled':
                        task.status = 'failed'
                        task.completed_at = timezone.now()
                        task.error_message = str(e)
                        task.save()
            except Exception:
                # Never let status bookkeeping kill the thread, but leave a
                # trace instead of swallowing silently (was a bare except: pass).
                logging.getLogger(__name__).exception(
                    "记录任务失败状态时出错: task_id=%s", task_id)

        finally:
            # Always deregister, whatever happened above.
            with self.lock:
                self.running_tasks.pop(task_id, None)

    @staticmethod
    def _target_websites(task):
        """Return the list of target website names for *task*.

        ``task.websites`` is a ManyToManyField, so its related manager is
        always truthy and must be queried for names; the original
        ``task.websites if task.websites else ...`` therefore never fell
        back to "all sites" and passed a manager where the crawl helpers
        expect a list of names. An empty selection means "all supported
        websites".
        """
        names = list(task.websites.values_list('name', flat=True))
        return names or list(WEBSITE_SEARCH_CONFIGS.keys())

    def _update_results(self, task, results):
        """Copy the aggregate counters from a crawl helper's result dict."""
        task.total_articles = results['total_articles']
        task.success_count = results['success_count']
        task.failed_count = results['failed_count']
        task.result_details = results['website_results']
        task.save()

    def _execute_keyword_task(self, task):
        """Run a keyword-search crawl and store the aggregate results."""
        task.current_action = "开始关键词搜索"
        task.save()

        websites = self._target_websites(task)
        start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None
        end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None

        results = crawl_by_keyword(
            keyword=task.keyword,
            website_names=websites,
            max_pages=task.max_pages,
            start_date=start_date,
            end_date=end_date,
            max_articles=task.max_articles
        )
        self._update_results(task, results)

    def _execute_historical_task(self, task):
        """Run a historical (date-range) crawl and store the results."""
        task.current_action = "开始历史文章爬取"
        task.save()

        websites = self._target_websites(task)
        start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None
        end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None

        results = crawl_historical_articles(
            website_names=websites,
            start_date=start_date,
            end_date=end_date,
            max_articles_per_site=task.max_articles
        )
        self._update_results(task, results)

    def _execute_full_site_task(self, task):
        """Crawl each target website in full, updating progress per site."""
        task.current_action = "开始全站爬取"
        task.save()

        websites = self._target_websites(task)
        total_websites = len(websites)
        if total_websites == 0:
            # Nothing configured at all — avoid division by zero below.
            return

        completed_websites = 0
        for website_name in websites:
            try:
                task.current_website = website_name
                task.current_action = f"正在爬取 {website_name}"
                task.save()

                # Local import mirrors the original (avoids import cycles).
                from core.models import Website
                website, _created = Website.objects.get_or_create(
                    name=website_name,
                    defaults={
                        'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                        'enabled': True
                    }
                )

                full_site_crawler(
                    WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                    website,
                    max_pages=task.max_pages
                )

                completed_websites += 1
                task.progress = int((completed_websites / total_websites) * 100)
                task.save()

            except Exception as e:
                # Log and continue so one bad site doesn't abort the task.
                logging.getLogger(__name__).warning(
                    "爬取网站 %s 时出错: %s", website_name, e)
                continue

        # NOTE(review): these counters count *websites*, not articles,
        # although the field names suggest articles — kept for now.
        task.total_articles = completed_websites
        task.success_count = completed_websites
        task.failed_count = total_websites - completed_websites
        task.save()

    def get_task_status(self, task_id):
        """Return a status snapshot dict for the task, or ``None`` if the
        task does not exist."""
        try:
            task = CrawlTask.objects.get(id=task_id)
        except CrawlTask.DoesNotExist:
            return None
        return {
            'status': task.status,
            'progress': task.progress,
            'current_website': task.current_website,
            'current_action': task.current_action,
            'total_articles': task.total_articles,
            'success_count': task.success_count,
            'failed_count': task.failed_count,
            'error_message': task.error_message
        }


# Module-wide singleton used by the admin views.
task_executor = TaskExecutor()
|
||||
139
core/templates/admin/create_full_site_task.html
Normal file
139
core/templates/admin/create_full_site_task.html
Normal file
@@ -0,0 +1,139 @@
|
||||
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_modify %}
{# Admin form for creating a full-site CrawlTask. Expects `title` and `websites` in the context. #}

{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}

{% block breadcrumbs %}
<div class="breadcrumbs">
    <a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
    › <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
    › {{ title }}
</div>
{% endblock %}

{% block content %}
<h1>{{ title }}</h1>

{# Full-site crawls can run a very long time; warn the operator up front. #}
<div class="help" style="background: #fff3cd; border: 1px solid #ffeaa7; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
    <strong>注意:</strong>全站爬取会爬取整个网站的所有文章,可能需要很长时间。建议在非高峰时段进行。
</div>

<form method="post" id="full-site-task-form">
    {% csrf_token %}

    <fieldset class="module aligned">
        <h2>基本信息</h2>
        <div class="form-row">
            <div>
                <label for="id_name" class="required">任务名称:</label>
                <input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
                <p class="help">为这个全站爬取任务起一个容易识别的名称</p>
            </div>
        </div>
    </fieldset>

    <fieldset class="module aligned">
        <h2>目标网站</h2>
        <div class="form-row">
            <div>
                <label>选择要爬取的网站:</label>
                <div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
                    <label style="display: block; margin: 5px 0;">
                        <input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
                        <strong>全选/取消全选</strong>
                    </label>
                    <hr style="margin: 10px 0;">
                    {% for website in websites %}
                    <label style="display: block; margin: 3px 0;">
                        <input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
                        {{ website.name }}
                    </label>
                    {% endfor %}
                </div>
                {# No selection = crawl every supported website (server-side default). #}
                <p class="help">不选择任何网站将爬取所有支持的网站</p>
            </div>
        </div>
    </fieldset>

    <fieldset class="module aligned">
        <h2>爬取设置</h2>
        <div class="form-row">
            <div>
                <label for="id_max_pages">最大爬取页数:</label>
                <input type="number" name="max_pages" id="id_max_pages" value="500" min="1" max="5000" style="width: 100px;">
                <p class="help">每个网站最多爬取的页数 (1-5000)</p>
            </div>
        </div>
    </fieldset>

    <div class="submit-row">
        <input type="submit" value="创建任务" class="default" name="_save">
        <a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
    </div>
</form>

<script>
function toggleAllWebsites() {
    const selectAll = document.getElementById('select_all');
    const checkboxes = document.querySelectorAll('.website-checkbox');

    checkboxes.forEach(checkbox => {
        checkbox.checked = selectAll.checked;
    });
}
</script>

<style>
.form-row {
    margin-bottom: 15px;
}

.form-row label {
    display: block;
    font-weight: bold;
    margin-bottom: 5px;
}

.form-row input[type="text"],
.form-row input[type="number"] {
    padding: 5px;
    border: 1px solid #ddd;
    border-radius: 3px;
}

.form-row .help {
    color: #666;
    font-size: 12px;
    margin-top: 3px;
}

.submit-row {
    margin-top: 20px;
    padding-top: 20px;
    border-top: 1px solid #ddd;
}

.submit-row input[type="submit"] {
    background: #417690;
    color: white;
    padding: 10px 20px;
    border: none;
    border-radius: 3px;
    cursor: pointer;
}

.submit-row .cancel-link {
    margin-left: 10px;
    padding: 10px 20px;
    background: #f8f8f8;
    color: #333;
    text-decoration: none;
    border-radius: 3px;
    border: 1px solid #ddd;
}

.submit-row .cancel-link:hover {
    background: #e8e8e8;
}
</style>
{% endblock %}
|
||||
164
core/templates/admin/create_historical_task.html
Normal file
164
core/templates/admin/create_historical_task.html
Normal file
@@ -0,0 +1,164 @@
|
||||
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_modify %}
{# Admin form for creating a historical (date-range) CrawlTask. Expects `title` and `websites` in the context. #}

{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}

{% block breadcrumbs %}
<div class="breadcrumbs">
    <a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
    › <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
    › {{ title }}
</div>
{% endblock %}

{% block content %}
<h1>{{ title }}</h1>

<form method="post" id="historical-task-form">
    {% csrf_token %}

    <fieldset class="module aligned">
        <h2>基本信息</h2>
        <div class="form-row">
            <div>
                <label for="id_name" class="required">任务名称:</label>
                <input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
                <p class="help">为这个历史文章爬取任务起一个容易识别的名称</p>
            </div>
        </div>
    </fieldset>

    <fieldset class="module aligned">
        <h2>目标网站</h2>
        <div class="form-row">
            <div>
                <label>选择要爬取的网站:</label>
                <div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
                    <label style="display: block; margin: 5px 0;">
                        <input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
                        <strong>全选/取消全选</strong>
                    </label>
                    <hr style="margin: 10px 0;">
                    {% for website in websites %}
                    <label style="display: block; margin: 3px 0;">
                        <input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
                        {{ website.name }}
                    </label>
                    {% endfor %}
                </div>
                {# No selection = crawl every supported website (server-side default). #}
                <p class="help">不选择任何网站将爬取所有支持的网站</p>
            </div>
        </div>
    </fieldset>

    <fieldset class="module aligned">
        <h2>时间范围</h2>
        <div class="form-row">
            <div>
                <label for="id_start_date" class="required">开始日期:</label>
                <input type="date" name="start_date" id="id_start_date" required>
                <p class="help">历史文章的开始日期</p>
            </div>
        </div>

        <div class="form-row">
            <div>
                <label for="id_end_date" class="required">结束日期:</label>
                <input type="date" name="end_date" id="id_end_date" required>
                <p class="help">历史文章的结束日期</p>
            </div>
        </div>
    </fieldset>

    <fieldset class="module aligned">
        <h2>爬取设置</h2>
        <div class="form-row">
            <div>
                <label for="id_max_articles">每个网站最大文章数:</label>
                <input type="number" name="max_articles" id="id_max_articles" value="50" min="1" max="500" style="width: 100px;">
                <p class="help">每个网站最多爬取的文章数量 (1-500)</p>
            </div>
        </div>
    </fieldset>

    <div class="submit-row">
        <input type="submit" value="创建任务" class="default" name="_save">
        <a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
    </div>
</form>

<script>
function toggleAllWebsites() {
    const selectAll = document.getElementById('select_all');
    const checkboxes = document.querySelectorAll('.website-checkbox');

    checkboxes.forEach(checkbox => {
        checkbox.checked = selectAll.checked;
    });
}

// Pre-fill the date range with the last month as a sensible default.
document.addEventListener('DOMContentLoaded', function() {
    const today = new Date();
    const oneMonthAgo = new Date(today.getFullYear(), today.getMonth() - 1, today.getDate());

    document.getElementById('id_end_date').value = today.toISOString().split('T')[0];
    document.getElementById('id_start_date').value = oneMonthAgo.toISOString().split('T')[0];
});
</script>

<style>
.form-row {
    margin-bottom: 15px;
}

.form-row label {
    display: block;
    font-weight: bold;
    margin-bottom: 5px;
}

.form-row input[type="text"],
.form-row input[type="number"],
.form-row input[type="date"] {
    padding: 5px;
    border: 1px solid #ddd;
    border-radius: 3px;
}

.form-row .help {
    color: #666;
    font-size: 12px;
    margin-top: 3px;
}

.submit-row {
    margin-top: 20px;
    padding-top: 20px;
    border-top: 1px solid #ddd;
}

.submit-row input[type="submit"] {
    background: #417690;
    color: white;
    padding: 10px 20px;
    border: none;
    border-radius: 3px;
    cursor: pointer;
}

.submit-row .cancel-link {
    margin-left: 10px;
    padding: 10px 20px;
    background: #f8f8f8;
    color: #333;
    text-decoration: none;
    border-radius: 3px;
    border: 1px solid #ddd;
}

.submit-row .cancel-link:hover {
    background: #e8e8e8;
}
</style>
{% endblock %}
|
||||
180
core/templates/admin/create_keyword_task.html
Normal file
180
core/templates/admin/create_keyword_task.html
Normal file
@@ -0,0 +1,180 @@
|
||||
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_modify %}
{# Admin form for creating a keyword-search CrawlTask. Expects `title` and `websites` in the context. #}

{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}

{% block breadcrumbs %}
<div class="breadcrumbs">
    <a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
    › <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
    › {{ title }}
</div>
{% endblock %}

{% block content %}
<h1>{{ title }}</h1>

<form method="post" id="keyword-task-form">
    {% csrf_token %}

    <fieldset class="module aligned">
        <h2>基本信息</h2>
        <div class="form-row">
            <div>
                <label for="id_name" class="required">任务名称:</label>
                <input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
                <p class="help">为这个爬取任务起一个容易识别的名称</p>
            </div>
        </div>

        <div class="form-row">
            <div>
                <label for="id_keyword" class="required">搜索关键词:</label>
                <input type="text" name="keyword" id="id_keyword" required maxlength="200" style="width: 300px;">
                <p class="help">输入要搜索的关键词,例如:人工智能、两会、政策等</p>
            </div>
        </div>
    </fieldset>

    <fieldset class="module aligned">
        <h2>目标网站</h2>
        <div class="form-row">
            <div>
                <label>选择要爬取的网站:</label>
                <div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
                    <label style="display: block; margin: 5px 0;">
                        <input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
                        <strong>全选/取消全选</strong>
                    </label>
                    <hr style="margin: 10px 0;">
                    {% for website in websites %}
                    <label style="display: block; margin: 3px 0;">
                        <input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
                        {{ website.name }}
                    </label>
                    {% endfor %}
                </div>
                {# No selection = crawl every supported website (server-side default). #}
                <p class="help">不选择任何网站将爬取所有支持的网站</p>
            </div>
        </div>
    </fieldset>

    <fieldset class="module aligned">
        <h2>时间范围 (可选)</h2>
        <div class="form-row">
            <div>
                <label for="id_start_date">开始日期:</label>
                <input type="date" name="start_date" id="id_start_date">
                <p class="help">留空则搜索所有时间</p>
            </div>
        </div>

        <div class="form-row">
            <div>
                <label for="id_end_date">结束日期:</label>
                <input type="date" name="end_date" id="id_end_date">
                <p class="help">留空则搜索到当前时间</p>
            </div>
        </div>
    </fieldset>

    <fieldset class="module aligned">
        <h2>爬取设置</h2>
        <div class="form-row">
            <div>
                <label for="id_max_pages">最大搜索页数:</label>
                <input type="number" name="max_pages" id="id_max_pages" value="10" min="1" max="100" style="width: 100px;">
                <p class="help">每个网站最多搜索的页数 (1-100)</p>
            </div>
        </div>

        <div class="form-row">
            <div>
                <label for="id_max_articles">最大文章数量:</label>
                <input type="number" name="max_articles" id="id_max_articles" value="100" min="1" max="1000" style="width: 100px;">
                <p class="help">总共最多爬取的文章数量 (1-1000)</p>
            </div>
        </div>
    </fieldset>

    <div class="submit-row">
        <input type="submit" value="创建任务" class="default" name="_save">
        <a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
    </div>
</form>

<script>
function toggleAllWebsites() {
    const selectAll = document.getElementById('select_all');
    const checkboxes = document.querySelectorAll('.website-checkbox');

    checkboxes.forEach(checkbox => {
        checkbox.checked = selectAll.checked;
    });
}

// Pre-fill the optional date range with the last month as a default.
document.addEventListener('DOMContentLoaded', function() {
    const today = new Date();
    const oneMonthAgo = new Date(today.getFullYear(), today.getMonth() - 1, today.getDate());

    document.getElementById('id_end_date').value = today.toISOString().split('T')[0];
    document.getElementById('id_start_date').value = oneMonthAgo.toISOString().split('T')[0];
});
</script>

<style>
.form-row {
    margin-bottom: 15px;
}

.form-row label {
    display: block;
    font-weight: bold;
    margin-bottom: 5px;
}

.form-row input[type="text"],
.form-row input[type="number"],
.form-row input[type="date"] {
    padding: 5px;
    border: 1px solid #ddd;
    border-radius: 3px;
}

.form-row .help {
    color: #666;
    font-size: 12px;
    margin-top: 3px;
}

.submit-row {
    margin-top: 20px;
    padding-top: 20px;
    border-top: 1px solid #ddd;
}

.submit-row input[type="submit"] {
    background: #417690;
    color: white;
    padding: 10px 20px;
    border: none;
    border-radius: 3px;
    cursor: pointer;
}

.submit-row .cancel-link {
    margin-left: 10px;
    padding: 10px 20px;
    background: #f8f8f8;
    color: #333;
    text-decoration: none;
    border-radius: 3px;
    border: 1px solid #ddd;
}

.submit-row .cancel-link:hover {
    background: #e8e8e8;
}
</style>
{% endblock %}
|
||||
172
core/templates/admin/index.html
Normal file
172
core/templates/admin/index.html
Normal file
@@ -0,0 +1,172 @@
|
||||
{% extends "admin/base_site.html" %}
|
||||
{% load i18n static %}
|
||||
|
||||
{% block extrastyle %}{{ block.super }}<link rel="stylesheet" type="text/css" href="{% static "admin/css/dashboard.css" %}">{% endblock %}
|
||||
|
||||
{% block coltype %}colMS{% endblock %}
|
||||
|
||||
{% block bodyclass %}{{ block.super }} dashboard{% endblock %}
|
||||
|
||||
{% block breadcrumbs %}{% endblock %}
|
||||
|
||||
{% block nav-sidebar %}{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div id="content-main">
|
||||
|
||||
{% if app_list %}
|
||||
{% for app in app_list %}
|
||||
<div class="app-{{ app.app_label }} module">
|
||||
<table>
|
||||
<caption>
|
||||
<a href="{{ app.app_url }}" class="section" title="{% blocktranslate with name=app.name %}Models in the {{ name }} application{% endblocktranslate %}">{{ app.name }}</a>
|
||||
</caption>
|
||||
{% for model in app.models %}
|
||||
<tr class="model-{{ model.object_name|lower }}">
|
||||
{% if model.admin_url %}
|
||||
<th scope="row"><a href="{{ model.admin_url }}"{% if model.add_url %} class="addlink"{% endif %}>{{ model.name }}</a></th>
|
||||
{% else %}
|
||||
<th scope="row">{{ model.name }}</th>
|
||||
{% endif %}
|
||||
|
||||
{% if model.add_url %}
|
||||
<td><a href="{{ model.add_url }}" class="addlink">{% translate 'Add' %}</a></td>
|
||||
{% else %}
|
||||
<td> </td>
|
||||
{% endif %}
|
||||
|
||||
{% if model.admin_url %}
|
||||
{% if model.view_only %}
|
||||
<td><a href="{{ model.admin_url }}" class="viewlink">{% translate 'View' %}</a></td>
|
||||
{% else %}
|
||||
<td><a href="{{ model.admin_url }}" class="changelink">{% translate 'Change' %}</a></td>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<td> </td>
|
||||
{% endif %}
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<p>{% translate "You don't have permission to view or edit anything." %}</p>
|
||||
{% endif %}
|
||||
|
||||
<!-- 自定义快速操作区域 -->
|
||||
<div class="module" style="margin-top: 20px;">
|
||||
<h2>快速创建爬取任务</h2>
|
||||
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin-top: 15px;">
|
||||
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
|
||||
<h3 style="margin-top: 0; color: #417690;">关键词搜索</h3>
|
||||
<p style="color: #666; font-size: 14px;">根据关键词搜索并爬取相关文章</p>
|
||||
<a href="{% url 'admin:create_keyword_task' %}" class="button" style="background: #417690; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
|
||||
创建任务
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
|
||||
<h3 style="margin-top: 0; color: #28a745;">历史文章</h3>
|
||||
<p style="color: #666; font-size: 14px;">爬取指定日期范围的历史文章</p>
|
||||
<a href="{% url 'admin:create_historical_task' %}" class="button" style="background: #28a745; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
|
||||
创建任务
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
|
||||
<h3 style="margin-top: 0; color: #dc3545;">全站爬取</h3>
|
||||
<p style="color: #666; font-size: 14px;">爬取整个网站的所有文章</p>
|
||||
<a href="{% url 'admin:create_full_site_task' %}" class="button" style="background: #dc3545; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
|
||||
创建任务
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 最近任务状态 -->
|
||||
<div class="module" style="margin-top: 20px;">
|
||||
<h2>最近任务状态</h2>
|
||||
<div style="margin-top: 15px;">
|
||||
{% load core_extras %}
|
||||
{% get_recent_tasks as recent_tasks %}
|
||||
{% if recent_tasks %}
|
||||
<table style="width: 100%;">
|
||||
<thead>
|
||||
<tr style="background: #f8f9fa;">
|
||||
<th style="padding: 8px; text-align: left;">任务名称</th>
|
||||
<th style="padding: 8px; text-align: left;">类型</th>
|
||||
<th style="padding: 8px; text-align: left;">状态</th>
|
||||
<th style="padding: 8px; text-align: left;">进度</th>
|
||||
<th style="padding: 8px; text-align: left;">创建时间</th>
|
||||
<th style="padding: 8px; text-align: left;">操作</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for task in recent_tasks %}
|
||||
<tr>
|
||||
<td style="padding: 8px;">{{ task.name }}</td>
|
||||
<td style="padding: 8px;">{{ task.get_task_type_display }}</td>
|
||||
<td style="padding: 8px;">
|
||||
<span style="color: {% if task.status == 'completed' %}green{% elif task.status == 'failed' %}red{% elif task.status == 'running' %}blue{% else %}gray{% endif %};">
|
||||
{{ task.get_status_display }}
|
||||
</span>
|
||||
</td>
|
||||
<td style="padding: 8px;">
|
||||
{% if task.status == 'running' %}
|
||||
<div style="width: 100px; background-color: #f0f0f0; border-radius: 3px; overflow: hidden;">
|
||||
<div style="width: {{ task.progress }}%; background-color: #4CAF50; height: 16px; text-align: center; line-height: 16px; color: white; font-size: 12px;">
|
||||
{{ task.progress }}%
|
||||
</div>
|
||||
</div>
|
||||
{% else %}
|
||||
-
|
||||
{% endif %}
|
||||
</td>
|
||||
<td style="padding: 8px;">{{ task.created_at|date:"m-d H:i" }}</td>
|
||||
<td style="padding: 8px;">
|
||||
<a href="{% url 'admin:core_crawltask_change' task.id %}" style="color: #417690; text-decoration: none;">查看</a>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% else %}
|
||||
<p style="color: #666; text-align: center; padding: 20px;">暂无任务</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block sidebar %}
|
||||
<div id="content-related">
|
||||
<div class="module" id="recent-actions-module">
|
||||
<h2>{% translate 'Recent actions' %}</h2>
|
||||
<h3>{% translate 'My actions' %}</h3>
|
||||
{% load log %}
|
||||
{% get_admin_log 10 as admin_log for_user user %}
|
||||
{% if not admin_log %}
|
||||
<p>{% translate 'None available' %}</p>
|
||||
{% else %}
|
||||
<ul class="actionlist">
|
||||
{% for entry in admin_log %}
|
||||
<li class="{% if entry.is_addition %}addlink{% endif %}{% if entry.is_change %}changelink{% endif %}{% if entry.is_deletion %}deletelink{% endif %}">
|
||||
{% if entry.is_deletion or not entry.get_admin_url %}
|
||||
{{ entry.object_repr }}
|
||||
{% else %}
|
||||
<a href="{{ entry.get_admin_url }}">{{ entry.object_repr }}</a>
|
||||
{% endif %}
|
||||
<br>
|
||||
{% if entry.content_type %}
|
||||
<span class="mini quiet">{% filter capfirst %}{{ entry.content_type.name }}{% endfilter %}</span>
|
||||
{% else %}
|
||||
<span class="mini quiet">{% translate 'Unknown content' %}</span>
|
||||
{% endif %}
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
||||
184
core/templates/admin/task_results.html
Normal file
184
core/templates/admin/task_results.html
Normal file
@@ -0,0 +1,184 @@
|
||||
{% extends "admin/base_site.html" %}
|
||||
{% load i18n admin_urls static admin_modify %}
|
||||
|
||||
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
|
||||
|
||||
{% block breadcrumbs %}
|
||||
<div class="breadcrumbs">
|
||||
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
||||
› <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
|
||||
› {{ title }}
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<h1>{{ title }}</h1>
|
||||
|
||||
<div class="results-summary" style="background: #f8f9fa; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2>任务概览</h2>
|
||||
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
|
||||
<div>
|
||||
<strong>任务名称:</strong><br>
|
||||
{{ task.name }}
|
||||
</div>
|
||||
<div>
|
||||
<strong>任务类型:</strong><br>
|
||||
{{ task.get_task_type_display }}
|
||||
</div>
|
||||
<div>
|
||||
<strong>状态:</strong><br>
|
||||
<span style="color: {% if task.status == 'completed' %}green{% elif task.status == 'failed' %}red{% elif task.status == 'running' %}blue{% else %}gray{% endif %};">
|
||||
{{ task.get_status_display }}
|
||||
</span>
|
||||
</div>
|
||||
<div>
|
||||
<strong>创建时间:</strong><br>
|
||||
{{ task.created_at|date:"Y-m-d H:i:s" }}
|
||||
</div>
|
||||
{% if task.started_at %}
|
||||
<div>
|
||||
<strong>开始时间:</strong><br>
|
||||
{{ task.started_at|date:"Y-m-d H:i:s" }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if task.completed_at %}
|
||||
<div>
|
||||
<strong>完成时间:</strong><br>
|
||||
{{ task.completed_at|date:"Y-m-d H:i:s" }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if task.get_duration %}
|
||||
<div>
|
||||
<strong>执行时长:</strong><br>
|
||||
{{ task.duration_display }}
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="results-stats" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2>统计信息</h2>
|
||||
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 15px;">
|
||||
<div style="text-align: center; padding: 15px; background: #e3f2fd; border-radius: 5px;">
|
||||
<div style="font-size: 24px; font-weight: bold; color: #1976d2;">{{ task.total_articles }}</div>
|
||||
<div>总文章数</div>
|
||||
</div>
|
||||
<div style="text-align: center; padding: 15px; background: #e8f5e8; border-radius: 5px;">
|
||||
<div style="font-size: 24px; font-weight: bold; color: #388e3c;">{{ task.success_count }}</div>
|
||||
<div>成功数</div>
|
||||
</div>
|
||||
<div style="text-align: center; padding: 15px; background: #ffebee; border-radius: 5px;">
|
||||
<div style="font-size: 24px; font-weight: bold; color: #d32f2f;">{{ task.failed_count }}</div>
|
||||
<div>失败数</div>
|
||||
</div>
|
||||
{% if task.total_articles > 0 %}
|
||||
<div style="text-align: center; padding: 15px; background: #fff3e0; border-radius: 5px;">
|
||||
<div style="font-size: 24px; font-weight: bold; color: #f57c00;">
|
||||
{% widthratio task.success_count task.total_articles 100 %}%
|
||||
</div>
|
||||
<div>成功率</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% if task.keyword %}
|
||||
<div class="task-config" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2>任务配置</h2>
|
||||
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
|
||||
<div>
|
||||
<strong>搜索关键词:</strong><br>
|
||||
{{ task.keyword }}
|
||||
</div>
|
||||
<div>
|
||||
<strong>目标网站:</strong><br>
|
||||
{{ task.get_websites_display }}
|
||||
</div>
|
||||
{% if task.start_date %}
|
||||
<div>
|
||||
<strong>开始日期:</strong><br>
|
||||
{{ task.start_date }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if task.end_date %}
|
||||
<div>
|
||||
<strong>结束日期:</strong><br>
|
||||
{{ task.end_date }}
|
||||
</div>
|
||||
{% endif %}
|
||||
<div>
|
||||
<strong>最大页数:</strong><br>
|
||||
{{ task.max_pages }}
|
||||
</div>
|
||||
<div>
|
||||
<strong>最大文章数:</strong><br>
|
||||
{{ task.max_articles }}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if task.current_website or task.current_action %}
|
||||
<div class="current-status" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2>当前状态</h2>
|
||||
{% if task.current_website %}
|
||||
<div>
|
||||
<strong>当前网站:</strong> {{ task.current_website }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if task.current_action %}
|
||||
<div>
|
||||
<strong>当前操作:</strong> {{ task.current_action }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if task.status == 'running' %}
|
||||
<div style="margin-top: 10px;">
|
||||
<div style="width: 100%; background-color: #f0f0f0; border-radius: 10px; overflow: hidden;">
|
||||
<div style="width: {{ task.progress }}%; background-color: #4CAF50; height: 20px; text-align: center; line-height: 20px; color: white;">
|
||||
{{ task.progress }}%
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if task.error_message %}
|
||||
<div class="error-info" style="background: #ffebee; border: 1px solid #f44336; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2 style="color: #d32f2f;">错误信息</h2>
|
||||
<pre style="white-space: pre-wrap; word-wrap: break-word;">{{ task.error_message }}</pre>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if task.result_details %}
|
||||
<div class="detailed-results" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2>详细结果</h2>
|
||||
{% for website, result in task.result_details.items %}
|
||||
<div style="margin-bottom: 15px; padding: 10px; background: #f8f9fa; border-radius: 3px;">
|
||||
<strong>{{ website }}:</strong>
|
||||
<ul style="margin: 5px 0; padding-left: 20px;">
|
||||
<li>找到链接: {{ result.found_urls }}</li>
|
||||
<li>已处理: {{ result.processed }}</li>
|
||||
<li>成功: {{ result.success }}</li>
|
||||
<li>失败: {{ result.failed }}</li>
|
||||
{% if result.error %}
|
||||
<li style="color: red;">错误: {{ result.error }}</li>
|
||||
{% endif %}
|
||||
</ul>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div class="actions" style="text-align: center; margin-top: 30px;">
|
||||
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button" style="padding: 10px 20px; background: #417690; color: white; text-decoration: none; border-radius: 3px; margin-right: 10px;">
|
||||
返回任务列表
|
||||
</a>
|
||||
{% if task.status == 'completed' %}
|
||||
<a href="{% url 'admin:core_article_changelist' %}" class="button" style="padding: 10px 20px; background: #28a745; color: white; text-decoration: none; border-radius: 3px;">
|
||||
查看文章
|
||||
</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
{% endblock %}
|
||||
0
core/templatetags/__init__.py
Normal file
0
core/templatetags/__init__.py
Normal file
46
core/templatetags/core_extras.py
Normal file
46
core/templatetags/core_extras.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from django import template
|
||||
from django.core.cache import cache
|
||||
from core.models import CrawlTask
|
||||
|
||||
register = template.Library()
|
||||
|
||||
|
||||
@register.simple_tag
|
||||
def get_recent_tasks(limit=5):
|
||||
"""获取最近的任务"""
|
||||
cache_key = f'recent_tasks_{limit}'
|
||||
recent_tasks = cache.get(cache_key)
|
||||
|
||||
if recent_tasks is None:
|
||||
recent_tasks = CrawlTask.objects.all()[:limit]
|
||||
cache.set(cache_key, recent_tasks, 60) # 缓存1分钟
|
||||
|
||||
return recent_tasks
|
||||
|
||||
|
||||
@register.filter
|
||||
def task_status_color(status):
|
||||
"""根据任务状态返回颜色"""
|
||||
color_map = {
|
||||
'pending': 'gray',
|
||||
'running': 'blue',
|
||||
'completed': 'green',
|
||||
'failed': 'red',
|
||||
'cancelled': 'orange',
|
||||
}
|
||||
return color_map.get(status, 'gray')
|
||||
|
||||
|
||||
@register.filter
|
||||
def task_progress_bar(progress):
|
||||
"""生成进度条HTML"""
|
||||
if progress is None:
|
||||
progress = 0
|
||||
|
||||
return f'''
|
||||
<div style="width: 100px; background-color: #f0f0f0; border-radius: 3px; overflow: hidden;">
|
||||
<div style="width: {progress}%; background-color: #4CAF50; height: 16px; text-align: center; line-height: 16px; color: white; font-size: 12px;">
|
||||
{progress}%
|
||||
</div>
|
||||
</div>
|
||||
'''
|
||||
696
core/utils.py
696
core/utils.py
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from urllib.parse import urljoin, urlparse, quote
|
||||
from collections import deque
|
||||
from django.utils import timezone
|
||||
from django.conf import settings
|
||||
@@ -15,6 +15,8 @@ from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from datetime import datetime, timedelta
|
||||
import json
|
||||
|
||||
|
||||
def get_selenium_driver():
|
||||
@@ -2270,3 +2272,695 @@ def full_site_crawler(start_url, website, max_pages=1000):
|
||||
queue.append(href)
|
||||
elif href not in visited and is_valid_url(href, base_netloc):
|
||||
queue.append(href)
|
||||
|
||||
|
||||
# 网站搜索配置
|
||||
WEBSITE_SEARCH_CONFIGS = {
|
||||
"新华网": {
|
||||
"search_url": "http://so.news.cn/getNews",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"curPage": "{page}",
|
||||
"sortField": "0",
|
||||
"sortType": "1"
|
||||
},
|
||||
"method": "post",
|
||||
"headers": {
|
||||
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
}
|
||||
},
|
||||
"人民日报": {
|
||||
"search_url": "http://search.people.com.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"st": "0",
|
||||
"startDate": "{start_date}",
|
||||
"endDate": "{end_date}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"央视网": {
|
||||
"search_url": "https://search.cctv.com/search.php",
|
||||
"search_params": {
|
||||
"qtext": "{keyword}",
|
||||
"type": "web",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"光明日报": {
|
||||
"search_url": "http://search.gmw.cn/search",
|
||||
"search_params": {
|
||||
"q": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"经济日报": {
|
||||
"search_url": "http://www.ce.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"中国日报": {
|
||||
"search_url": "http://www.chinadaily.com.cn/search",
|
||||
"search_params": {
|
||||
"q": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"工人日报": {
|
||||
"search_url": "https://www.workercn.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"科技日报": {
|
||||
"search_url": "http://www.stdaily.com/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"人民政协网": {
|
||||
"search_url": "https://www.rmzxw.com.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"中国纪检监察报": {
|
||||
"search_url": "http://www.jjjcb.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"中国新闻社": {
|
||||
"search_url": "https://www.chinanews.com.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"学习时报": {
|
||||
"search_url": "https://www.studytimes.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"中国青年报": {
|
||||
"search_url": "http://news.cyol.com/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"中国妇女报": {
|
||||
"search_url": "https://www.cnwomen.com.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"法治日报": {
|
||||
"search_url": "http://www.legaldaily.com.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"农民日报": {
|
||||
"search_url": "https://www.farmer.com.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"学习强国": {
|
||||
"search_url": "https://www.xuexi.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"旗帜网": {
|
||||
"search_url": "http://www.qizhiwang.org.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"中国网": {
|
||||
"search_url": "http://www.china.com.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"中国政府网": {
|
||||
"search_url": "https://www.gov.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"求是网": {
|
||||
"search_url": "http://www.qstheory.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
},
|
||||
"解放军报": {
|
||||
"search_url": "http://www.81.cn/search",
|
||||
"search_params": {
|
||||
"keyword": "{keyword}",
|
||||
"page": "{page}"
|
||||
},
|
||||
"method": "get"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def search_articles_by_keyword(website_name, keyword, max_pages=10, start_date=None, end_date=None):
|
||||
"""
|
||||
根据关键词搜索文章
|
||||
|
||||
Args:
|
||||
website_name: 网站名称
|
||||
keyword: 搜索关键词
|
||||
max_pages: 最大搜索页数
|
||||
start_date: 开始日期 (YYYY-MM-DD)
|
||||
end_date: 结束日期 (YYYY-MM-DD)
|
||||
|
||||
Returns:
|
||||
list: 搜索到的文章URL列表
|
||||
"""
|
||||
if website_name not in WEBSITE_SEARCH_CONFIGS:
|
||||
print(f"网站 {website_name} 不支持搜索功能")
|
||||
return []
|
||||
|
||||
config = WEBSITE_SEARCH_CONFIGS[website_name]
|
||||
article_urls = []
|
||||
|
||||
# 设置默认日期范围
|
||||
if not start_date:
|
||||
start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
|
||||
if not end_date:
|
||||
end_date = datetime.now().strftime("%Y-%m-%d")
|
||||
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||
}
|
||||
headers.update(config.get("headers", {}))
|
||||
|
||||
for page in range(1, max_pages + 1):
|
||||
try:
|
||||
# 构建搜索参数
|
||||
search_params = {}
|
||||
for key, value in config["search_params"].items():
|
||||
search_params[key] = value.format(
|
||||
keyword=quote(keyword),
|
||||
page=page,
|
||||
start_date=start_date,
|
||||
end_date=end_date
|
||||
)
|
||||
|
||||
print(f"搜索 {website_name} 第 {page} 页: {keyword}")
|
||||
|
||||
if config["method"] == "post":
|
||||
response = requests.post(
|
||||
config["search_url"],
|
||||
data=search_params,
|
||||
headers=headers,
|
||||
timeout=15
|
||||
)
|
||||
else:
|
||||
response = requests.get(
|
||||
config["search_url"],
|
||||
params=search_params,
|
||||
headers=headers,
|
||||
timeout=15
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
response.encoding = 'utf-8'
|
||||
|
||||
# 解析搜索结果
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
page_urls = extract_search_results(soup, website_name)
|
||||
|
||||
if not page_urls:
|
||||
print(f"第 {page} 页没有找到更多结果")
|
||||
break
|
||||
|
||||
article_urls.extend(page_urls)
|
||||
print(f"第 {page} 页找到 {len(page_urls)} 篇文章")
|
||||
|
||||
# 避免请求过快
|
||||
time.sleep(1)
|
||||
|
||||
except Exception as e:
|
||||
print(f"搜索第 {page} 页时出错: {e}")
|
||||
continue
|
||||
|
||||
print(f"总共找到 {len(article_urls)} 篇文章")
|
||||
return article_urls
|
||||
|
||||
|
||||
def extract_search_results(soup, website_name):
|
||||
"""
|
||||
从搜索结果页面提取文章链接
|
||||
|
||||
Args:
|
||||
soup: BeautifulSoup对象
|
||||
website_name: 网站名称
|
||||
|
||||
Returns:
|
||||
list: 文章URL列表
|
||||
"""
|
||||
urls = []
|
||||
|
||||
# 根据不同网站的搜索结果结构提取链接
|
||||
if website_name == "新华网":
|
||||
# 新华网搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/news/" in href or "/article/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "人民日报":
|
||||
# 人民日报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/n1/" in href or "/article/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "央视网":
|
||||
# 央视网搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/news/" in href or "ARTI" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "光明日报":
|
||||
# 光明日报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "经济日报":
|
||||
# 经济日报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "中国日报":
|
||||
# 中国日报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "工人日报":
|
||||
# 工人日报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/c/" in href or "/article/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "科技日报":
|
||||
# 科技日报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "人民政协网":
|
||||
# 人民政协网搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "中国纪检监察报":
|
||||
# 中国纪检监察报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "中国新闻社":
|
||||
# 中国新闻社搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "学习时报":
|
||||
# 学习时报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "中国青年报":
|
||||
# 中国青年报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "中国妇女报":
|
||||
# 中国妇女报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "法治日报":
|
||||
# 法治日报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/content/" in href and "content_" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "农民日报":
|
||||
# 农民日报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "学习强国":
|
||||
# 学习强国搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "旗帜网":
|
||||
# 旗帜网搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/n1/" in href or "/article/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "中国网":
|
||||
# 中国网搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/opinion/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "中国政府网":
|
||||
# 中国政府网搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/zhengce/" in href or "/xinwen/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "求是网":
|
||||
# 求是网搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/article/" in href or "/content/" in href:
|
||||
urls.append(href)
|
||||
|
||||
elif website_name == "解放军报":
|
||||
# 解放军报搜索结果结构
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if "/zt/" in href or "/article/" in href:
|
||||
urls.append(href)
|
||||
|
||||
# 去重并返回
|
||||
return list(set(urls))
|
||||
|
||||
|
||||
def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None, end_date=None, max_articles=100):
|
||||
"""
|
||||
根据关键词爬取多个网站的文章
|
||||
|
||||
Args:
|
||||
keyword: 搜索关键词
|
||||
website_names: 网站名称列表,如果为None则爬取所有支持的网站
|
||||
max_pages: 每个网站最大搜索页数
|
||||
start_date: 开始日期 (YYYY-MM-DD)
|
||||
end_date: 结束日期 (YYYY-MM-DD)
|
||||
max_articles: 最大文章数量
|
||||
|
||||
Returns:
|
||||
dict: 爬取结果统计
|
||||
"""
|
||||
if website_names is None:
|
||||
website_names = list(WEBSITE_SEARCH_CONFIGS.keys())
|
||||
|
||||
results = {
|
||||
"keyword": keyword,
|
||||
"total_articles": 0,
|
||||
"success_count": 0,
|
||||
"failed_count": 0,
|
||||
"website_results": {}
|
||||
}
|
||||
|
||||
print(f"开始根据关键词 '{keyword}' 爬取文章...")
|
||||
print(f"目标网站: {', '.join(website_names)}")
|
||||
|
||||
for website_name in website_names:
|
||||
print(f"\n开始爬取 {website_name}...")
|
||||
|
||||
try:
|
||||
# 获取或创建网站对象
|
||||
from core.models import Website
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=website_name,
|
||||
defaults={
|
||||
'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
|
||||
'enabled': True
|
||||
}
|
||||
)
|
||||
|
||||
# 搜索文章URL
|
||||
article_urls = search_articles_by_keyword(
|
||||
website_name, keyword, max_pages, start_date, end_date
|
||||
)
|
||||
|
||||
if not article_urls:
|
||||
print(f"{website_name} 没有找到相关文章")
|
||||
results["website_results"][website_name] = {
|
||||
"found_urls": 0,
|
||||
"processed": 0,
|
||||
"success": 0,
|
||||
"failed": 0
|
||||
}
|
||||
continue
|
||||
|
||||
# 限制文章数量
|
||||
if len(article_urls) > max_articles:
|
||||
article_urls = article_urls[:max_articles]
|
||||
|
||||
print(f"{website_name} 找到 {len(article_urls)} 篇文章,开始处理...")
|
||||
|
||||
website_success = 0
|
||||
website_failed = 0
|
||||
|
||||
for i, url in enumerate(article_urls, 1):
|
||||
try:
|
||||
print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
|
||||
process_article(url, website)
|
||||
website_success += 1
|
||||
results["success_count"] += 1
|
||||
|
||||
# 避免请求过快
|
||||
time.sleep(0.5)
|
||||
|
||||
except Exception as e:
|
||||
print(f"处理文章失败: {url}, 错误: {e}")
|
||||
website_failed += 1
|
||||
results["failed_count"] += 1
|
||||
|
||||
results["website_results"][website_name] = {
|
||||
"found_urls": len(article_urls),
|
||||
"processed": len(article_urls),
|
||||
"success": website_success,
|
||||
"failed": website_failed
|
||||
}
|
||||
|
||||
print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"爬取 {website_name} 时出错: {e}")
|
||||
results["website_results"][website_name] = {
|
||||
"found_urls": 0,
|
||||
"processed": 0,
|
||||
"success": 0,
|
||||
"failed": 1,
|
||||
"error": str(e)
|
||||
}
|
||||
results["failed_count"] += 1
|
||||
|
||||
results["total_articles"] = results["success_count"] + results["failed_count"]
|
||||
|
||||
print(f"\n爬取完成!")
|
||||
print(f"关键词: {keyword}")
|
||||
print(f"总文章数: {results['total_articles']}")
|
||||
print(f"成功: {results['success_count']}")
|
||||
print(f"失败: {results['failed_count']}")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def crawl_historical_articles(website_names=None, start_date=None, end_date=None, max_articles_per_site=50):
    """Crawl historical articles from the configured websites.

    Each site is searched with a fixed set of broad Chinese keywords
    ("news", "report", ...); the discovered article URLs are de-duplicated
    in discovery order (so the selection is deterministic, unlike iterating
    a ``set``) and at most ``max_articles_per_site`` of them are processed.

    Args:
        website_names: List of website names to crawl. Defaults to every
            site configured in ``WEBSITE_SEARCH_CONFIGS``.
        start_date: Inclusive start date, ``"YYYY-MM-DD"``. Defaults to
            30 days before now.
        end_date: Inclusive end date, ``"YYYY-MM-DD"``. Defaults to today.
        max_articles_per_site: Maximum number of articles to process per site.

    Returns:
        dict: Aggregate statistics -- ``total_articles``, ``success_count``,
        ``failed_count`` and a per-site breakdown under ``website_results``.
        Per-site failures are recorded (with an ``"error"`` key) rather than
        raised, so one broken site does not abort the whole run.
    """
    if not start_date:
        start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
    if not end_date:
        end_date = datetime.now().strftime("%Y-%m-%d")

    if website_names is None:
        website_names = list(WEBSITE_SEARCH_CONFIGS.keys())

    results = {
        "start_date": start_date,
        "end_date": end_date,
        "total_articles": 0,
        "success_count": 0,
        "failed_count": 0,
        "website_results": {}
    }

    print("开始爬取历史文章...")
    print(f"日期范围: {start_date} 到 {end_date}")
    print(f"目标网站: {', '.join(website_names)}")

    # Broad keywords used as a proxy for "historical articles" -- there is
    # no dedicated archive endpoint, so we fan out several generic searches.
    common_keywords = ["新闻", "报道", "文章", "资讯", "动态"]

    for website_name in website_names:
        print(f"\n开始爬取 {website_name} 历史文章...")

        try:
            # Imported lazily so this module can be used outside Django setup.
            from core.models import Website
            website, _ = Website.objects.get_or_create(
                name=website_name,
                defaults={
                    'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                    'enabled': True
                }
            )

            website_success = 0
            website_failed = 0

            # Order-preserving de-duplication: a plain set would make the
            # truncation below pick an arbitrary subset on every run.
            seen_urls = set()
            candidate_urls = []
            for keyword in common_keywords:
                try:
                    found_urls = search_articles_by_keyword(
                        website_name, keyword, max_pages=5,
                        start_date=start_date, end_date=end_date
                    )
                except Exception as e:
                    print(f"搜索关键词 '{keyword}' 时出错: {e}")
                    continue

                for url in found_urls:
                    if url not in seen_urls:
                        seen_urls.add(url)
                        candidate_urls.append(url)

                if len(candidate_urls) >= max_articles_per_site:
                    break

            # Cap the per-site workload.
            article_urls = candidate_urls[:max_articles_per_site]

            if not article_urls:
                print(f"{website_name} 没有找到历史文章")
                results["website_results"][website_name] = {
                    "found_urls": 0,
                    "processed": 0,
                    "success": 0,
                    "failed": 0
                }
                continue

            print(f"{website_name} 找到 {len(article_urls)} 篇历史文章,开始处理...")

            for i, url in enumerate(article_urls, 1):
                try:
                    print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
                    process_article(url, website)
                    website_success += 1
                    results["success_count"] += 1

                    # Throttle to avoid hammering the target site.
                    time.sleep(0.5)

                except Exception as e:
                    # Per-article failures are counted but never abort the site.
                    print(f"处理文章失败: {url}, 错误: {e}")
                    website_failed += 1
                    results["failed_count"] += 1

            results["website_results"][website_name] = {
                "found_urls": len(article_urls),
                "processed": len(article_urls),
                "success": website_success,
                "failed": website_failed
            }

            print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")

        except Exception as e:
            # Site-level failure (DB error, missing config, ...): record and move on.
            print(f"爬取 {website_name} 历史文章时出错: {e}")
            results["website_results"][website_name] = {
                "found_urls": 0,
                "processed": 0,
                "success": 0,
                "failed": 1,
                "error": str(e)
            }
            results["failed_count"] += 1

    results["total_articles"] = results["success_count"] + results["failed_count"]

    print("\n历史文章爬取完成!")
    print(f"日期范围: {start_date} 到 {end_date}")
    print(f"总文章数: {results['total_articles']}")
    print(f"成功: {results['success_count']}")
    print(f"失败: {results['failed_count']}")

    return results
|
||||
|
||||
Reference in New Issue
Block a user