Compare commits

..

45 Commits

SHA1 Message Date
a36d730384 support keyword crawl 2025-09-26 10:39:36 +08:00
499454ff27 fix bug 2025-09-24 04:02:43 +08:00
bf927dc77c fix bug 2025-09-24 03:56:14 +08:00
81a17132e2 fix bug 2025-09-24 03:52:55 +08:00
8592833d74 Support keword 2025-09-24 03:38:32 +08:00
a4891b1c30 Support first case: 1. Add filters in website; 2. Add export all file in admin 2025-09-12 03:37:26 +08:00
922a88048b Add Celery 2025-08-17 03:25:43 +08:00
100a0cd042 Add log 2025-08-17 03:20:08 +08:00
514197d5b3 add dockerfile 2025-08-17 02:52:45 +08:00
31fe69535c Support docker 2025-08-17 02:46:54 +08:00
1b947158a9 Support more crawler 2025-08-17 02:20:51 +08:00
46f9ff87f1 Support celery 2025-08-17 02:19:40 +08:00
193894fcb4 deploy test 2025-08-17 02:12:25 +08:00
4945b4c6b0 fix bugs and support all platform 2025-08-15 08:33:47 +08:00
e82b85f4dd fix all bug 2025-08-15 05:58:16 +08:00
c4dfc515f7 fix some bug 2025-08-15 05:40:08 +08:00
8db0512a6a Remove the other 2025-08-15 04:07:08 +08:00
d3760c5780 Remove mobile platform 2025-08-15 04:02:32 +08:00
490cc835d2 fix qiushi bug 2025-08-15 04:02:03 +08:00
99660f4218 Remove mobile platform 2025-08-15 03:48:23 +08:00
b9c31a4da1 Remove mobile platform 2025-08-15 03:09:53 +08:00
2fe9e40840 Unsupport 2025-08-15 03:09:21 +08:00
0aff839ed2 Remove mobile platform 2025-08-15 03:07:04 +08:00
8405bd2402 Remove mobile platform 2025-08-15 03:05:16 +08:00
651964ebfc Remove mobile platform 2025-08-15 02:54:45 +08:00
e71e7e7eb3 Unsupport gmrb Because not supporot catch 2025-08-15 02:54:11 +08:00
7e6325c68e Remove mobile platform 2025-08-15 02:51:46 +08:00
d64bf93988 Fix fzrb bug : add support catch fzrb 2025-08-15 02:38:14 +08:00
83d1b21686 Remove mobile platform 2025-08-15 02:29:32 +08:00
7b16c384d3 Change cngov && dongfangyancao setup 2025-08-15 02:23:46 +08:00
e04a611dbc Remove mobile platform 2025-08-15 02:04:30 +08:00
1856f3e9fc Fix chinadaily bug : Support more packages 2025-08-15 02:03:13 +08:00
89909d2781 Support CCTV Plamforms 2025-08-15 01:08:53 +08:00
ac98ac0057 Add Support All Platform 2025-08-14 23:42:16 +08:00
4994310f14 Add Support the other website 2025-08-14 14:24:18 +08:00
31d0525cd0 fix bugs 2025-08-13 21:35:11 +08:00
c618528a0a Unknow change 2025-08-13 18:40:31 +08:00
5e396796ca Add export into front 2025-08-13 00:26:39 +08:00
baea50bfa0 Init code 2025-08-12 17:14:10 +08:00
130999364f Add Support CNGOV 2025-08-12 17:13:24 +08:00
958b087f54 Add Search button 2025-08-11 23:42:14 +08:00
b6bbb90703 Support export for Word 2025-08-11 23:14:56 +08:00
bfd1604872 Add packages 2025-08-11 22:55:57 +08:00
d9d2ea9d99 Add Support dongfangyaocao 2025-08-11 22:20:19 +08:00
6d80326a4e Add Support full site 2025-08-11 14:33:32 +08:00
64 changed files with 13168 additions and 136 deletions

6
.gitignore vendored

@@ -180,5 +180,11 @@ cython_debug/
#
#####################################
# Data directories
data/
data/media/
# Configuration files
config/
.env

73
Dockerfile Normal file

@@ -0,0 +1,73 @@
# Use the official Python 3.12 image
FROM python:3.12-slim
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV DJANGO_SETTINGS_MODULE=green_classroom.settings
# Set the working directory
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \
gcc \
g++ \
libpq-dev \
curl \
wget \
gnupg \
unzip \
&& rm -rf /var/lib/apt/lists/*
# Install Chrome and ChromeDriver
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
&& echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list \
&& apt-get update \
&& apt-get install -y google-chrome-stable \
&& rm -rf /var/lib/apt/lists/*
# Download ChromeDriver (note: the legacy chromedriver.storage.googleapis.com endpoint only serves drivers up to Chrome 114)
RUN CHROME_VERSION=$(google-chrome --version | awk '{print $3}' | awk -F'.' '{print $1}') \
&& wget -q "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_VERSION}" -O /tmp/chromedriver_version \
&& CHROMEDRIVER_VERSION=$(cat /tmp/chromedriver_version) \
&& wget -q "https://chromedriver.storage.googleapis.com/${CHROMEDRIVER_VERSION}/chromedriver_linux64.zip" -O /tmp/chromedriver.zip \
&& unzip /tmp/chromedriver.zip -d /usr/local/bin/ \
&& rm /tmp/chromedriver.zip /tmp/chromedriver_version \
&& chmod +x /usr/local/bin/chromedriver
# Copy requirements.txt
COPY requirements.txt .
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy the project files
COPY . .
# Create the required directories
RUN mkdir -p /app/data/logs /app/data/static /app/data/media
# Collect static files
RUN python manage.py collectstatic --noinput
# Expose the application port
EXPOSE 8000
# Create the entrypoint script
RUN echo '#!/bin/bash\n\
if [ "$1" = "celery" ]; then\n\
exec celery -A green_classroom worker --loglevel=info\n\
elif [ "$1" = "celery-beat" ]; then\n\
exec celery -A green_classroom beat --loglevel=info\n\
elif [ "$1" = "flower" ]; then\n\
exec celery -A green_classroom flower\n\
elif [ "$1" = "gunicorn" ]; then\n\
exec gunicorn green_classroom.asgi:application -b 0.0.0.0:8000 --worker-class uvicorn.workers.UvicornWorker --workers 2\n\
else\n\
exec python manage.py runserver 0.0.0.0:8000\n\
fi' > /app/entrypoint.sh && chmod +x /app/entrypoint.sh
# Set the entrypoint
ENTRYPOINT ["/app/entrypoint.sh"]
CMD ["runserver"]
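The celery, celery-beat and flower modes of this entrypoint all pass -A green_classroom to Celery, which assumes the project package exposes a Celery application (normally a green_classroom/celery.py module; that file is not part of this diff). A minimal sketch of what such a module looks like under the standard Celery-for-Django layout, shown only to make the entrypoint self-explanatory:

# green_classroom/celery.py -- hypothetical sketch, not taken from this diff
import os

from celery import Celery

# Make the Django settings importable before the Celery app is created.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "green_classroom.settings")

app = Celery("green_classroom")

# Read CELERY_* options (broker URL, result backend, ...) from Django settings.
app.config_from_object("django.conf:settings", namespace="CELERY")

# Pick up tasks.py modules from installed apps, e.g. core/tasks.py.
app.autodiscover_tasks()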

core/admin.py

@@ -1,11 +0,0 @@
from django.contrib import admin
from .models import Website, Article
@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
list_display = ('name', 'base_url', 'enabled')
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'website', 'pub_date')
search_fields = ('title', 'content')

947
core/admin_extended.py Normal file

@@ -0,0 +1,947 @@
"""
Django Admin扩展
提供增强的管理界面功能
"""
import logging
from datetime import datetime, timedelta
from django.contrib import admin
from django.contrib.admin import SimpleListFilter
from django.contrib.admin.utils import model_format_dict
from django.contrib import messages
from django.http import HttpResponseRedirect
from django.urls import path, reverse
from django.utils.html import format_html
from django.utils import timezone
from django.db.models import Count, Q
from django.core.cache import cache
from .models import Website, Article, CrawlTask
from .tasks import crawl_website, crawl_all_websites, cleanup_old_articles
from .distributed_crawler import distributed_crawler
from .task_executor import task_executor
logger = logging.getLogger(__name__)
class WebsiteStatusFilter(SimpleListFilter):
"""网站状态过滤器"""
title = '网站状态'
parameter_name = 'status'
def lookups(self, request, model_admin):
return (
('enabled', '已启用'),
('disabled', '已禁用'),
('no_articles', '无文章'),
('recent_crawl', '最近爬取'),
)
def queryset(self, request, queryset):
if self.value() == 'enabled':
return queryset.filter(enabled=True)
elif self.value() == 'disabled':
return queryset.filter(enabled=False)
elif self.value() == 'no_articles':
return queryset.annotate(article_count=Count('article')).filter(article_count=0)
elif self.value() == 'recent_crawl':
week_ago = timezone.now() - timedelta(days=7)
return queryset.filter(last_crawl__gte=week_ago)
return queryset
class ArticleDateFilter(SimpleListFilter):
"""文章日期过滤器"""
title = '发布时间'
parameter_name = 'date_range'
def lookups(self, request, model_admin):
return (
('today', '今天'),
('week', '本周'),
('month', '本月'),
('quarter', '本季度'),
)
def queryset(self, request, queryset):
now = timezone.now()
if self.value() == 'today':
return queryset.filter(created_at__date=now.date())
elif self.value() == 'week':
week_start = now - timedelta(days=now.weekday())
return queryset.filter(created_at__gte=week_start.replace(hour=0, minute=0, second=0))
elif self.value() == 'month':
return queryset.filter(created_at__year=now.year, created_at__month=now.month)
elif self.value() == 'quarter':
quarter = (now.month - 1) // 3
quarter_start_month = quarter * 3 + 1
return queryset.filter(
created_at__year=now.year,
created_at__month__gte=quarter_start_month,
created_at__month__lt=quarter_start_month + 3
)
return queryset
class WebsiteAdmin(admin.ModelAdmin):
"""网站管理"""
list_display = [
'name', 'base_url', 'enabled', 'article_count',
'last_crawl_display', 'status_indicator', 'actions_column'
]
list_filter = [WebsiteStatusFilter, 'enabled']
search_fields = ['name', 'base_url']
readonly_fields = ['article_count']
actions = ['enable_websites', 'disable_websites', 'crawl_selected', 'crawl_all']
fieldsets = (
('基本信息', {
'fields': ('name', 'base_url', 'enabled')
}),
('统计信息', {
'fields': ('article_count',),
'classes': ('collapse',)
}),
('时间信息', {
'fields': (),
'classes': ('collapse',)
}),
)
# 添加get_websites方法以支持模板中的网站选择
def get_websites(self, request):
"""获取所有启用的网站,用于模板中的选择框"""
return Website.objects.filter(enabled=True)
def article_count(self, obj):
"""文章数量"""
return obj.article_set.count()
article_count.short_description = '文章数量'
def last_crawl_display(self, obj):
"""最后爬取时间显示"""
return '未实现'
last_crawl_display.short_description = '最后爬取'
def status_indicator(self, obj):
"""状态指示器"""
if obj.enabled:
return format_html('<span style="color: green;">●</span> 正常')
else:
return format_html('<span style="color: red;">●</span> 禁用')
status_indicator.short_description = '状态'
def actions_column(self, obj):
"""操作列"""
return format_html(
'<a href="{}" class="button">爬取</a> '
'<a href="{}" class="button">查看文章</a>',
reverse('admin:crawl_website', args=[obj.id]),
reverse('admin:core_article_changelist') + f'?website__id__exact={obj.id}'
)
actions_column.short_description = '操作'
def enable_websites(self, request, queryset):
"""启用选中的网站"""
updated = queryset.update(enabled=True)
self.message_user(request, f'成功启用 {updated} 个网站')
enable_websites.short_description = '启用选中的网站'
def disable_websites(self, request, queryset):
"""禁用选中的网站"""
updated = queryset.update(enabled=False)
self.message_user(request, f'成功禁用 {updated} 个网站')
disable_websites.short_description = '禁用选中的网站'
def crawl_selected(self, request, queryset):
"""爬取选中的网站"""
for website in queryset:
try:
task = crawl_website.delay(website.id)
self.message_user(
request,
f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝可能是Redis或其他依赖服务未启动。请检查以下几点\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver''celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(
request,
f'网站 {website.name} 爬取任务启动失败: {detailed_msg}',
messages.ERROR
)
crawl_selected.short_description = '爬取选中的网站'
def crawl_all(self, request, queryset):
try:
task = crawl_all_websites.delay()
self.message_user(
request,
f'批量爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝可能是Redis或其他依赖服务未启动。请检查以下几点\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver''celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(
request,
f'批量爬取任务启动失败: {detailed_msg}',
messages.ERROR
)
# crawl_all.short_description = '爬取所有网站'
def get_urls(self):
"""添加自定义URL"""
urls = super().get_urls()
custom_urls = [
path(
'<int:website_id>/crawl/',
self.admin_site.admin_view(self.crawl_website_view),
name='crawl_website',
),
path(
'run-crawler/',
self.admin_site.admin_view(self.run_crawler_view),
name='run_crawler',
),
]
return custom_urls + urls
def crawl_website_view(self, request, website_id):
"""爬取单个网站视图"""
try:
website = Website.objects.get(id=website_id)
task = crawl_website.delay(website_id)
self.message_user(
request,
f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Website.DoesNotExist:
self.message_user(request, '网站不存在', messages.ERROR)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝可能是Redis或其他依赖服务未启动。请检查以下几点\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver''celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(request, f'爬取任务启动失败: {detailed_msg}', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_website_changelist'))
def run_crawler_view(self, request):
"""运行爬虫视图"""
try:
task = crawl_all_websites.delay()
self.message_user(
request,
f'批量爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝可能是Redis或其他依赖服务未启动。请检查以下几点\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver''celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(
request,
f'批量爬取任务启动失败: {detailed_msg}',
messages.ERROR
)
return HttpResponseRedirect(reverse('admin:core_website_changelist'))
class ArticleAdmin(admin.ModelAdmin):
"""文章管理"""
list_display = [
'title', 'website', 'created_at',
'media_count', 'actions_column'
]
list_filter = [
ArticleDateFilter, 'website', 'created_at'
]
search_fields = ['title', 'content', 'url']
readonly_fields = ['created_at', 'media_files_display']
date_hierarchy = 'created_at'
fieldsets = (
('基本信息', {
'fields': ('title', 'url', 'website')
}),
('内容', {
'fields': ('content',)
}),
('媒体文件', {
'fields': ('media_files_display',),
'classes': ('collapse',)
}),
('时间信息', {
'fields': ('created_at',),
'classes': ('collapse',)
}),
)
# 添加导出选中文章的操作
actions = ['export_selected_articles']
def export_selected_articles(self, request, queryset):
"""
导出选中的文章为ZIP文件
"""
import zipfile
from django.http import HttpResponse
from io import BytesIO
from django.conf import settings
import os
from bs4 import BeautifulSoup
from docx import Document
# 创建内存中的ZIP文件
zip_buffer = BytesIO()
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
# 为每篇文章创建文件夹并添加内容
for article in queryset:
# 创建文章文件夹名称
article_folder = f"article_{article.id}_{article.title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')}"
# 创建Word文档
doc = Document()
doc.add_heading(article.title, 0)
# 添加文章信息
doc.add_paragraph(f"网站: {article.website.name if article.website else ''}")
doc.add_paragraph(f"URL: {article.url}")
doc.add_paragraph(f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else ''}")
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S') if article.created_at else ''}")
# 添加内容标题
doc.add_heading('内容:', level=1)
# 处理HTML内容
soup = BeautifulSoup(article.content, 'html.parser')
content_text = soup.get_text()
doc.add_paragraph(content_text)
# 将Word文档保存到内存中
doc_buffer = BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
# 将Word文档添加到ZIP文件
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'), doc_buffer.getvalue())
# 添加媒体文件到ZIP包
if article.media_files:
for media_file in article.media_files:
try:
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加文件到ZIP包
zip_file.write(full_path, os.path.join(article_folder, 'media', os.path.basename(media_file)))
except Exception as e:
# 如果添加媒体文件失败,继续处理其他文件
pass
# 创建HttpResponse
zip_buffer.seek(0)
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename=selected_articles.zip'
return response
export_selected_articles.short_description = "导出所选的文章为ZIP"
def content_preview(self, obj):
"""内容预览"""
return obj.content[:100] + '...' if len(obj.content) > 100 else obj.content
content_preview.short_description = '内容预览'
def media_count(self, obj):
"""媒体文件数量"""
if obj.media_files:
return len(obj.media_files)
return 0
media_count.short_description = '媒体文件'
def media_files_display(self, obj):
"""媒体文件显示"""
if not obj.media_files:
return '无媒体文件'
html = '<div style="max-height: 300px; overflow-y: auto;">'
for i, media in enumerate(obj.media_files):
if media.get('type') == 'image':
html += f'<div style="margin: 10px 0;"><img src="{media["url"]}" style="max-width: 200px; max-height: 150px;" /></div>'
elif media.get('type') == 'video':
html += f'<div style="margin: 10px 0;"><video controls style="max-width: 200px;"><source src="{media["url"]}" type="video/mp4"></video></div>'
html += '</div>'
return format_html(html)
media_files_display.short_description = '媒体文件'
def actions_column(self, obj):
"""操作列"""
# 修改: 添加跳转到本地文章详情页的链接
return format_html(
'<a href="{}" target="_blank" class="button">查看原文</a> '
'<a href="{}" target="_blank" class="button">本地查看</a>',
obj.url,
reverse('article_detail', args=[obj.id])
)
actions_column.short_description = '操作'
class CrawlTaskStatusFilter(SimpleListFilter):
"""爬取任务状态过滤器"""
title = '任务状态'
parameter_name = 'status'
def lookups(self, request, model_admin):
return (
('pending', '等待中'),
('running', '运行中'),
('completed', '已完成'),
('failed', '失败'),
('cancelled', '已取消'),
)
def queryset(self, request, queryset):
if self.value():
return queryset.filter(status=self.value())
return queryset
class CrawlTaskTypeFilter(SimpleListFilter):
"""爬取任务类型过滤器"""
title = '任务类型'
parameter_name = 'task_type'
def lookups(self, request, model_admin):
return (
('keyword', '关键词搜索'),
('historical', '历史文章'),
('full_site', '全站爬取'),
)
def queryset(self, request, queryset):
if self.value():
return queryset.filter(task_type=self.value())
return queryset
class CrawlTaskAdmin(admin.ModelAdmin):
"""爬取任务管理"""
list_display = [
'name', 'task_type', 'keyword', 'websites_display', 'status',
'progress_display', 'created_at', 'duration_display', 'actions_column'
]
list_filter = [CrawlTaskStatusFilter, CrawlTaskTypeFilter, 'created_at']
search_fields = ['name', 'keyword', 'created_by']
readonly_fields = [
'status', 'progress', 'current_website', 'current_action',
'total_articles', 'success_count', 'failed_count',
'created_at', 'started_at', 'completed_at', 'error_message',
'result_details', 'duration_display', 'progress_display',
'execution_count', 'last_execution_at', 'execution_summary'
]
actions = ['start_tasks', 'rerun_tasks', 'cancel_tasks', 'delete_completed_tasks']
class Media:
js = ('admin/js/crawl_task_actions.js',)
fieldsets = (
('基本信息', {
'fields': ('name', 'task_type', 'keyword')
}),
('爬取配置', {
'fields': ('websites', 'start_date', 'end_date', 'max_pages', 'max_articles')
}),
('任务状态', {
'fields': ('status', 'progress_display', 'current_website', 'current_action'),
'classes': ('collapse',)
}),
('统计信息', {
'fields': ('total_articles', 'success_count', 'failed_count'),
'classes': ('collapse',)
}),
('时间信息', {
'fields': ('created_at', 'started_at', 'completed_at', 'duration_display'),
'classes': ('collapse',)
}),
('执行历史', {
'fields': ('execution_count', 'last_execution_at', 'execution_summary'),
'classes': ('collapse',)
}),
('错误信息', {
'fields': ('error_message',),
'classes': ('collapse',)
}),
('结果详情', {
'fields': ('result_details',),
'classes': ('collapse',)
}),
)
def websites_display(self, obj):
"""网站列表显示"""
return obj.get_websites_display()
websites_display.short_description = '目标网站'
def progress_display(self, obj):
"""进度显示"""
if obj.status == 'running':
return format_html(
'<div style="width: 100px; background-color: #f0f0f0; border-radius: 3px;">'
'<div style="width: {}%; background-color: #4CAF50; height: 20px; border-radius: 3px; text-align: center; color: white; line-height: 20px;">{}%</div>'
'</div>',
obj.progress, obj.progress
)
elif obj.status == 'completed':
return format_html('<span style="color: green;">✓ 完成</span>')
elif obj.status == 'failed':
return format_html('<span style="color: red;">✗ 失败</span>')
elif obj.status == 'cancelled':
return format_html('<span style="color: orange;">⊘ 已取消</span>')
else:
return format_html('<span style="color: gray;">⏳ 等待</span>')
progress_display.short_description = '进度'
def duration_display(self, obj):
"""执行时长显示"""
duration = obj.get_duration()
if duration:
total_seconds = int(duration.total_seconds())
hours = total_seconds // 3600
minutes = (total_seconds % 3600) // 60
seconds = total_seconds % 60
if hours > 0:
return f"{hours}小时{minutes}分钟"
elif minutes > 0:
return f"{minutes}分钟{seconds}"
else:
return f"{seconds}"
return "-"
duration_display.short_description = '执行时长'
def execution_summary(self, obj):
"""执行摘要显示"""
return obj.get_execution_summary()
execution_summary.short_description = '执行摘要'
def actions_column(self, obj):
"""操作列"""
actions = []
if obj.status == 'pending':
actions.append(f'<a href="javascript:void(0)" onclick="startTask({obj.id})" class="button">开始</a>')
if obj.can_cancel():
actions.append(f'<a href="javascript:void(0)" onclick="cancelTask({obj.id})" class="button">取消</a>')
if obj.status == 'completed':
actions.append(f'<a href="javascript:void(0)" onclick="viewResults({obj.id})" class="button">查看结果</a>')
actions.append(f'<a href="javascript:void(0)" onclick="rerunTask({obj.id})" class="button" style="background-color: #28a745;">重新执行</a>')
if obj.status in ['failed', 'cancelled']:
actions.append(f'<a href="javascript:void(0)" onclick="rerunTask({obj.id})" class="button" style="background-color: #28a745;">重新执行</a>')
return format_html(' '.join(actions))
actions_column.short_description = '操作'
def start_tasks(self, request, queryset):
"""启动选中的任务"""
started_count = 0
for task in queryset.filter(status='pending'):
try:
success, message = task_executor.start_task(task.id)
if success:
started_count += 1
else:
self.message_user(request, f'启动任务 {task.name} 失败: {message}', messages.ERROR)
except Exception as e:
self.message_user(request, f'启动任务 {task.name} 失败: {e}', messages.ERROR)
if started_count > 0:
self.message_user(request, f'成功启动 {started_count} 个任务', messages.SUCCESS)
start_tasks.short_description = '启动选中的任务'
def rerun_tasks(self, request, queryset):
"""重新执行选中的任务"""
rerun_count = 0
for task in queryset.filter(status__in=['completed', 'failed', 'cancelled']):
try:
success, message = task_executor.rerun_task(task.id)
if success:
rerun_count += 1
else:
self.message_user(request, f'重新执行任务 {task.name} 失败: {message}', messages.ERROR)
except Exception as e:
self.message_user(request, f'重新执行任务 {task.name} 失败: {e}', messages.ERROR)
if rerun_count > 0:
self.message_user(request, f'成功重新执行 {rerun_count} 个任务', messages.SUCCESS)
rerun_tasks.short_description = '重新执行选中的任务'
def cancel_tasks(self, request, queryset):
"""取消选中的任务"""
cancelled_count = 0
for task in queryset.filter(status__in=['pending', 'running']):
try:
success, message = task_executor.cancel_task(task.id)
if success:
cancelled_count += 1
else:
self.message_user(request, f'取消任务 {task.name} 失败: {message}', messages.ERROR)
except Exception as e:
self.message_user(request, f'取消任务 {task.name} 失败: {e}', messages.ERROR)
if cancelled_count > 0:
self.message_user(request, f'成功取消 {cancelled_count} 个任务', messages.SUCCESS)
elif queryset.filter(status__in=['pending', 'running']).count() > 0:
# 有任务但没有成功取消任何任务
self.message_user(request, '没有成功取消任何任务', messages.WARNING)
cancel_tasks.short_description = '取消选中的任务'
def delete_completed_tasks(self, request, queryset):
"""删除已完成的任务"""
completed_tasks = queryset.filter(status__in=['completed', 'failed', 'cancelled'])
count = completed_tasks.count()
completed_tasks.delete()
if count > 0:
self.message_user(request, f'成功删除 {count} 个已完成的任务', messages.SUCCESS)
delete_completed_tasks.short_description = '删除已完成的任务'
def get_urls(self):
"""添加自定义URL"""
urls = super().get_urls()
custom_urls = [
path(
'create-keyword-task/',
self.admin_site.admin_view(self.create_keyword_task_view),
name='create_keyword_task',
),
path(
'create-historical-task/',
self.admin_site.admin_view(self.create_historical_task_view),
name='create_historical_task',
),
path(
'create-full-site-task/',
self.admin_site.admin_view(self.create_full_site_task_view),
name='create_full_site_task',
),
path(
'<int:task_id>/start/',
self.admin_site.admin_view(self.start_task_view),
name='start_task',
),
path(
'<int:task_id>/cancel/',
self.admin_site.admin_view(self.cancel_task_view),
name='cancel_task',
),
path(
'<int:task_id>/rerun/',
self.admin_site.admin_view(self.rerun_task_view),
name='rerun_task',
),
path(
'<int:task_id>/results/',
self.admin_site.admin_view(self.view_results_view),
name='view_results',
),
]
return custom_urls + urls
def create_keyword_task_view(self, request):
"""创建关键词搜索任务视图"""
if request.method == 'POST':
try:
from .utils import WEBSITE_CRAWL_CONFIGS
name = request.POST.get('name', '')
keyword = request.POST.get('keyword', '')
websites = request.POST.getlist('websites')
start_date = request.POST.get('start_date')
end_date = request.POST.get('end_date')
max_pages = int(request.POST.get('max_pages', 10))
max_articles = int(request.POST.get('max_articles', 100))
if not name or not keyword:
self.message_user(request, '任务名称和关键词不能为空', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
# 创建任务
task = CrawlTask.objects.create(
name=name,
task_type='keyword',
keyword=keyword,
start_date=start_date if start_date else None,
end_date=end_date if end_date else None,
max_pages=max_pages,
max_articles=max_articles,
created_by=request.user.username if request.user.is_authenticated else 'admin'
)
# 添加选择的网站
if websites:
website_objects = Website.objects.filter(name__in=websites)
task.websites.set(website_objects)
self.message_user(request, f'关键词搜索任务 "{name}" 创建成功', messages.SUCCESS)
return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
except Exception as e:
self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
# GET请求显示创建表单
context = {
'websites': Website.objects.filter(enabled=True),
'title': '创建关键词搜索任务'
}
return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_keyword_task.html', context)
def create_historical_task_view(self, request):
"""创建历史文章任务视图"""
if request.method == 'POST':
try:
from .utils import WEBSITE_CRAWL_CONFIGS
name = request.POST.get('name', '')
websites = request.POST.getlist('websites')
start_date = request.POST.get('start_date')
end_date = request.POST.get('end_date')
max_articles = int(request.POST.get('max_articles', 50))
if not name:
self.message_user(request, '任务名称不能为空', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
# 创建任务
task = CrawlTask.objects.create(
name=name,
task_type='historical',
keyword='历史文章',
start_date=start_date if start_date else None,
end_date=end_date if end_date else None,
max_articles=max_articles,
created_by=request.user.username if request.user.is_authenticated else 'admin'
)
# 添加选择的网站
if websites:
website_objects = Website.objects.filter(name__in=websites)
task.websites.set(website_objects)
self.message_user(request, f'历史文章任务 "{name}" 创建成功', messages.SUCCESS)
return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
except Exception as e:
self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
# GET请求显示创建表单
context = {
'websites': Website.objects.filter(enabled=True),
'title': '创建历史文章任务'
}
return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_historical_task.html', context)
def create_full_site_task_view(self, request):
"""创建全站爬取任务视图"""
if request.method == 'POST':
try:
from .utils import WEBSITE_CRAWL_CONFIGS
name = request.POST.get('name', '')
websites = request.POST.getlist('websites')
max_pages = int(request.POST.get('max_pages', 500))
if not name:
self.message_user(request, '任务名称不能为空', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
# 创建任务
task = CrawlTask.objects.create(
name=name,
task_type='full_site',
keyword='全站爬取',
max_pages=max_pages,
created_by=request.user.username if request.user.is_authenticated else 'admin'
)
# 添加选择的网站
if websites:
website_objects = Website.objects.filter(name__in=websites)
task.websites.set(website_objects)
self.message_user(request, f'全站爬取任务 "{name}" 创建成功', messages.SUCCESS)
return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
except Exception as e:
self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
# GET请求显示创建表单
context = {
'websites': Website.objects.filter(enabled=True),
'title': '创建全站爬取任务'
}
return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_full_site_task.html', context)
def start_task_view(self, request, task_id):
"""启动任务视图"""
try:
success, message = task_executor.start_task(task_id)
if success:
self.message_user(request, f'任务已启动: {message}', messages.SUCCESS)
else:
self.message_user(request, f'启动任务失败: {message}', messages.ERROR)
except Exception as e:
self.message_user(request, f'启动任务失败: {e}', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
def rerun_task_view(self, request, task_id):
"""重新执行任务视图"""
try:
success, message = task_executor.rerun_task(task_id)
if success:
self.message_user(request, f'任务已重新执行: {message}', messages.SUCCESS)
else:
self.message_user(request, f'重新执行任务失败: {message}', messages.ERROR)
except Exception as e:
self.message_user(request, f'重新执行任务失败: {e}', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
def cancel_task_view(self, request, task_id):
"""取消任务视图"""
try:
success, message = task_executor.cancel_task(task_id)
if success:
self.message_user(request, f'任务已取消: {message}', messages.SUCCESS)
else:
self.message_user(request, f'取消任务失败: {message}', messages.ERROR)
except Exception as e:
self.message_user(request, f'取消任务失败: {e}', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
def view_results_view(self, request, task_id):
"""查看结果视图"""
try:
task = CrawlTask.objects.get(id=task_id)
context = {
'task': task,
'title': f'任务结果 - {task.name}'
}
return admin.site.admin_view(self.render_create_task_template)(request, 'admin/task_results.html', context)
except CrawlTask.DoesNotExist:
self.message_user(request, '任务不存在', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
def render_create_task_template(self, request, template_name, context):
"""渲染创建任务模板"""
from django.template.loader import render_to_string
from django.http import HttpResponse
context.update({
'site_header': admin.site.site_header,
'site_title': admin.site.site_title,
'has_permission': True,
'user': request.user,
})
html = render_to_string(template_name, context)
return HttpResponse(html)
#class CrawlerStatusAdmin(admin.ModelAdmin):
# """爬虫状态管理"""
# change_list_template = 'admin/crawler_status.html'
#
# def changelist_view(self, request, extra_context=None):
# """爬虫状态视图"""
# # 获取分布式爬虫状态
# nodes = distributed_crawler.get_available_nodes()
# node_statuses = []
#
# for node_id in nodes:
# status = distributed_crawler.get_node_status(node_id)
# node_statuses.append(status)
#
# # 获取最近的批次
# batches = distributed_crawler.get_all_batches()[:10]
#
# # 获取任务统计
# task_stats = {
# 'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]),
# 'total_nodes': len(nodes),
# 'total_batches': len(batches),
# }
#
# extra_context = extra_context or {}
# extra_context.update({
# 'nodes': node_statuses,
# 'batches': batches,
# 'task_stats': task_stats,
# })
#
# return super().changelist_view(request, extra_context)
#
# 注册管理类
admin.site.register(Website, WebsiteAdmin)
admin.site.register(Article, ArticleAdmin)
admin.site.register(CrawlTask, CrawlTaskAdmin)
# 隐藏Celery Results管理功能
# 禁用django_celery_results应用的自动注册
try:
from django_celery_results.models import TaskResult, GroupResult
from django_celery_results.admin import TaskResultAdmin, GroupResultAdmin
admin.site.unregister(TaskResult)
admin.site.unregister(GroupResult)
except:
pass
# 隐藏Celery Beat周期任务管理功能
# 禁用django_celery_beat应用的自动注册
try:
from django_celery_beat.models import PeriodicTask, ClockedSchedule, CrontabSchedule, SolarSchedule, IntervalSchedule
admin.site.unregister(PeriodicTask)
admin.site.unregister(ClockedSchedule)
admin.site.unregister(CrontabSchedule)
admin.site.unregister(SolarSchedule)
admin.site.unregister(IntervalSchedule)
except:
pass
# 自定义管理站点标题
admin.site.site_header = 'Green Classroom 管理系统'
admin.site.site_title = 'Green Classroom'
admin.site.index_title = '欢迎使用 Green Classroom 管理系统'
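Because export_selected_articles returns the ZIP response straight from the changelist, the action can also be driven programmatically, which is convenient for smoke testing. A minimal sketch using Django's test client; it assumes the admin site is mounted at the default /admin/ prefix and that a superuser with these placeholder credentials exists:

# Hypothetical smoke test for the export action defined above.
from django.test import Client

client = Client()
client.login(username="admin", password="change-me")  # placeholder credentials

response = client.post(
    "/admin/core/article/",                    # Article changelist under the default admin prefix
    {
        "action": "export_selected_articles",  # action name registered on ArticleAdmin
        "_selected_action": ["1", "2"],        # primary keys of the articles to export
    },
)

# The action answers with a ZIP containing one folder (Word document plus media) per article.
assert response["Content-Type"] == "application/zip"
with open("selected_articles.zip", "wb") as fh:
    fh.write(response.content)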

746
core/api.py Normal file

@@ -0,0 +1,746 @@
"""
RESTful API模块
提供完整的API接口支持爬虫管理、数据查询、任务控制
"""
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Any
import json
import csv
import io
import zipfile
from django.http import JsonResponse, HttpResponse
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods
from django.core.paginator import Paginator
from django.db.models import Q, Count
from django.utils import timezone
# 添加DRF相关导入
from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework.permissions import IsAuthenticated
from rest_framework.authentication import SessionAuthentication, TokenAuthentication
# 添加python-docx库支持
from docx import Document
# 添加BeautifulSoup导入
from bs4 import BeautifulSoup
from .models import Website, Article
from .tasks import crawl_website, cleanup_old_articles
from .distributed_crawler import distributed_crawler
logger = logging.getLogger(__name__)
def api_response(data=None, message="", status=200, error=None):
"""统一的API响应格式"""
response = {
"success": status < 400,
"message": message,
"timestamp": datetime.now().isoformat(),
}
if data is not None:
response["data"] = data
if error:
response["error"] = error
# 如果是DRF视图则返回DRF Response
if hasattr(api_response, '_use_drf_response') and api_response._use_drf_response:
return Response(response, status=status)
return JsonResponse(response, status=status)
# 修改健康检查接口为DRF类视图
class HealthView(APIView):
"""健康检查接口"""
permission_classes = [] # 允许无认证访问
authentication_classes = []
def get(self, request):
try:
# 检查数据库连接
website_count = Website.objects.count()
article_count = Article.objects.count()
# 检查Redis连接
from django.core.cache import cache
cache.set('health_check', 'ok', 60)
cache_result = cache.get('health_check')
health_data = {
"status": "healthy",
"database": "ok",
"redis": "ok" if cache_result == 'ok' else 'error',
"website_count": website_count,
"article_count": article_count,
"uptime": "running"
}
# 设置使用DRF响应
api_response._use_drf_response = True
return api_response(data=health_data, message="服务运行正常")
except Exception as e:
logger.error(f"健康检查失败: {e}")
return api_response(
data={"status": "unhealthy", "error": str(e)},
message="服务异常",
status=500,
error=str(e)
)
finally:
api_response._use_drf_response = False
# 修改网站列表接口为DRF类视图
class WebsitesView(APIView):
"""获取网站列表"""
permission_classes = [IsAuthenticated]
authentication_classes = [SessionAuthentication, TokenAuthentication]
def get(self, request):
try:
# 分页参数
page = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 20))
search = request.GET.get('search', '')
enabled = request.GET.get('enabled', '')
# 构建查询
queryset = Website.objects.all()
if search:
queryset = queryset.filter(
Q(name__icontains=search) |
Q(base_url__icontains=search)
)
if enabled in ['true', 'false']:
queryset = queryset.filter(enabled=enabled == 'true')
# 排序 - 使用id字段替代不存在的created_at字段
queryset = queryset.order_by('-id')
# 分页
paginator = Paginator(queryset, page_size)
websites_page = paginator.get_page(page)
# 统计数据
stats = {
'total_websites': Website.objects.count(),
'enabled_websites': Website.objects.filter(enabled=True).count(),
'disabled_websites': Website.objects.filter(enabled=False).count(),
}
# 序列化数据
websites_data = []
for website in websites_page:
website_data = {
'id': website.id,
'name': website.name,
'base_url': website.base_url,
'enabled': website.enabled,
# 移除不存在的created_at和updated_at字段
'article_count': website.article_set.count(),
'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
}
websites_data.append(website_data)
response_data = {
'websites': websites_data,
'pagination': {
'page': page,
'page_size': page_size,
'total_pages': paginator.num_pages,
'total_count': paginator.count,
'has_next': websites_page.has_next(),
'has_previous': websites_page.has_previous(),
},
'stats': stats
}
# 设置使用DRF响应
api_response._use_drf_response = True
return api_response(data=response_data, message="获取网站列表成功")
except Exception as e:
logger.error(f"获取网站列表失败: {e}")
return api_response(message="获取网站列表失败", status=500, error=str(e))
finally:
api_response._use_drf_response = False
@csrf_exempt
@require_http_methods(["GET"])
def api_website_detail(request, website_id):
"""获取网站详情"""
try:
website = Website.objects.get(id=website_id)
# 获取最近的文章
recent_articles = website.article_set.order_by('-created_at')[:10]
website_data = {
'id': website.id,
'name': website.name,
'base_url': website.base_url,
'enabled': website.enabled,
'created_at': website.created_at.isoformat(),
'updated_at': website.updated_at.isoformat(),
'last_crawl': website.last_crawl.isoformat() if website.last_crawl else None,
'article_count': website.article_set.count(),
'recent_articles': [
{
'id': article.id,
'title': article.title,
'url': article.url,
'created_at': article.created_at.isoformat(),
}
for article in recent_articles
]
}
return api_response(data=website_data, message="获取网站详情成功")
except Website.DoesNotExist:
return api_response(message="网站不存在", status=404, error="Website not found")
except Exception as e:
logger.error(f"获取网站详情失败: {e}")
return api_response(message="获取网站详情失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["POST"])
def api_crawl_website(request, website_id):
"""爬取指定网站"""
try:
website = Website.objects.get(id=website_id)
# 启动爬虫任务
task = crawl_website.delay(website_id)
response_data = {
'task_id': task.id,
'website_id': website_id,
'website_name': website.name,
'status': 'started'
}
return api_response(data=response_data, message="爬虫任务已启动")
except Website.DoesNotExist:
return api_response(message="网站不存在", status=404, error="Website not found")
except Exception as e:
logger.error(f"启动爬虫任务失败: {e}")
return api_response(message="启动爬虫任务失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET"])
def api_articles(request):
"""获取文章列表"""
try:
# 分页参数
page = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 20))
search = request.GET.get('search', '')
website_id = request.GET.get('website_id', '')
date_from = request.GET.get('date_from', '')
date_to = request.GET.get('date_to', '')
# 构建查询
queryset = Article.objects.select_related('website').all()
if search:
queryset = queryset.filter(
Q(title__icontains=search) |
Q(content__icontains=search)
)
if website_id:
queryset = queryset.filter(website_id=website_id)
if date_from:
try:
date_from_obj = datetime.fromisoformat(date_from.replace('Z', '+00:00'))
queryset = queryset.filter(created_at__gte=date_from_obj)
except ValueError:
pass
if date_to:
try:
date_to_obj = datetime.fromisoformat(date_to.replace('Z', '+00:00'))
queryset = queryset.filter(created_at__lte=date_to_obj)
except ValueError:
pass
# 排序
queryset = queryset.order_by('-created_at')
# 分页
paginator = Paginator(queryset, page_size)
articles_page = paginator.get_page(page)
# 统计数据
stats = {
'total_articles': Article.objects.count(),
'today_articles': Article.objects.filter(
created_at__date=timezone.now().date()
).count(),
'week_articles': Article.objects.filter(
created_at__gte=timezone.now() - timedelta(days=7)
).count(),
}
# 序列化数据
articles_data = []
for article in articles_page:
article_data = {
'id': article.id,
'title': article.title,
'url': article.url,
'content': article.content[:200] + '...' if len(article.content) > 200 else article.content,
'created_at': article.created_at.isoformat(),
'website': {
'id': article.website.id,
'name': article.website.name,
},
'media_files': article.media_files,
}
articles_data.append(article_data)
response_data = {
'articles': articles_data,
'pagination': {
'page': page,
'page_size': page_size,
'total_pages': paginator.num_pages,
'total_count': paginator.count,
'has_next': articles_page.has_next(),
'has_previous': articles_page.has_previous(),
},
'stats': stats
}
return api_response(data=response_data, message="获取文章列表成功")
except Exception as e:
logger.error(f"获取文章列表失败: {e}")
return api_response(message="获取文章列表失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET"])
def api_article_detail(request, article_id):
"""获取文章详情"""
try:
article = Article.objects.select_related('website').get(id=article_id)
article_data = {
'id': article.id,
'title': article.title,
'url': article.url,
'content': article.content,
'created_at': article.created_at.isoformat(),
'website': {
'id': article.website.id,
'name': article.website.name,
'base_url': article.website.base_url,
},
'media_files': article.media_files,
}
return api_response(data=article_data, message="获取文章详情成功")
except Article.DoesNotExist:
return api_response(message="文章不存在", status=404, error="Article not found")
except Exception as e:
logger.error(f"获取文章详情失败: {e}")
return api_response(message="获取文章详情失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET"])
def api_crawler_status(request):
"""获取爬虫状态"""
try:
# 获取分布式爬虫状态
nodes = distributed_crawler.get_available_nodes()
node_statuses = []
for node_id in nodes:
status = distributed_crawler.get_node_status(node_id)
node_statuses.append(status)
# 获取最近的批次
batches = distributed_crawler.get_all_batches()[:10]
# 获取任务统计
task_stats = {
'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]),
'total_nodes': len(nodes),
'total_batches': len(batches),
}
response_data = {
'nodes': node_statuses,
'batches': batches,
'stats': task_stats,
}
return api_response(data=response_data, message="获取爬虫状态成功")
except Exception as e:
logger.error(f"获取爬虫状态失败: {e}")
return api_response(message="获取爬虫状态失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["POST"])
def api_start_distributed_crawl(request):
"""启动分布式爬取"""
try:
data = json.loads(request.body)
website_ids = data.get('website_ids', [])
if not website_ids:
return api_response(message="请选择要爬取的网站", status=400, error="No websites selected")
# 启动分布式爬取
batch_id = distributed_crawler.distribute_crawl_tasks(website_ids)
if batch_id in ['no_websites', 'no_available_nodes']:
return api_response(message="无法启动分布式爬取", status=400, error=batch_id)
response_data = {
'batch_id': batch_id,
'website_ids': website_ids,
'status': 'started'
}
return api_response(data=response_data, message="分布式爬取已启动")
except json.JSONDecodeError:
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
except Exception as e:
logger.error(f"启动分布式爬取失败: {e}")
return api_response(message="启动分布式爬取失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET"])
def api_batch_status(request, batch_id):
"""获取批次状态"""
try:
batch_status = distributed_crawler.get_batch_status(batch_id)
if batch_status.get('status') == 'not_found':
return api_response(message="批次不存在", status=404, error="Batch not found")
return api_response(data=batch_status, message="获取批次状态成功")
except Exception as e:
logger.error(f"获取批次状态失败: {e}")
return api_response(message="获取批次状态失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET", "POST"])
def api_cleanup_articles(request):
"""清理旧文章"""
# 如果是GET请求返回清理功能的描述信息
if request.method == "GET":
response_data = {
'description': '文章清理API',
'method': 'POST',
'parameters': {
'days': '保留天数默认30天'
},
'example': {
'days': 30
}
}
return api_response(data=response_data, message="API使用说明")
try:
data = json.loads(request.body)
days = data.get('days', 30)
# 启动清理任务
task = cleanup_old_articles.delay(days)
response_data = {
'task_id': task.id,
'days': days,
'status': 'started'
}
return api_response(data=response_data, message="清理任务已启动")
except json.JSONDecodeError:
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
except Exception as e:
logger.error(f"启动清理任务失败: {e}")
return api_response(message="启动清理任务失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET"])
def api_stats(request):
"""获取统计信息"""
try:
# 基础统计
total_websites = Website.objects.count()
total_articles = Article.objects.count()
enabled_websites = Website.objects.filter(enabled=True).count()
# 时间统计
today = timezone.now().date()
week_ago = timezone.now() - timedelta(days=7)
month_ago = timezone.now() - timedelta(days=30)
today_articles = Article.objects.filter(created_at__date=today).count()
week_articles = Article.objects.filter(created_at__gte=week_ago).count()
month_articles = Article.objects.filter(created_at__gte=month_ago).count()
# 网站统计
website_stats = []
for website in Website.objects.all():
website_stats.append({
'id': website.id,
'name': website.name,
'article_count': website.article_set.count(),
# 使用getattr安全访问last_crawl属性如果不存在则返回None
'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
})
# 分布式爬虫统计
nodes = distributed_crawler.get_available_nodes()
batches = distributed_crawler.get_all_batches()
response_data = {
'overview': {
'total_websites': total_websites,
'enabled_websites': enabled_websites,
'total_articles': total_articles,
'today_articles': today_articles,
'week_articles': week_articles,
'month_articles': month_articles,
},
'websites': website_stats,
'crawler': {
'active_nodes': len(nodes),
'total_batches': len(batches),
'recent_batches': batches[:5],
}
}
return api_response(data=response_data, message="获取统计信息成功")
except Exception as e:
logger.error(f"获取统计信息失败: {e}")
return api_response(message="获取统计信息失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["POST"])
def export_articles(request):
"""导出文章"""
try:
data = json.loads(request.body)
article_ids = data.get('article_ids', [])
export_format = data.get('format', 'docx') # 默认改为docx格式
if not article_ids:
return api_response(message="请选择要导出的文章", status=400, error="No articles selected")
# 获取文章数据
articles = Article.objects.filter(id__in=article_ids).select_related('website')
if not articles.exists():
return api_response(message="未找到指定的文章", status=404, error="Articles not found")
import os # 添加导入
from django.conf import settings # 添加导入
if export_format == 'json':
# 导出为JSON格式
articles_data = []
for article in articles:
articles_data.append({
'id': article.id,
'title': article.title,
'url': article.url,
'content': article.content,
'created_at': article.created_at.isoformat(),
'website': {
'id': article.website.id,
'name': article.website.name,
},
'media_files': article.media_files,
})
response = HttpResponse(
json.dumps(articles_data, ensure_ascii=False, indent=2),
content_type='application/json'
)
response['Content-Disposition'] = 'attachment; filename="articles.json"'
return response
elif export_format == 'csv':
# 导出为CSV格式
output = io.StringIO()
writer = csv.writer(output)
writer.writerow(['ID', '标题', '网址', '内容', '创建时间', '网站'])
for article in articles:
writer.writerow([
article.id,
article.title,
article.url,
article.content[:1000] + '...' if len(article.content) > 1000 else article.content,
article.created_at.isoformat(),
article.website.name
])
response = HttpResponse(output.getvalue(), content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename="articles.csv"'
return response
elif export_format == 'docx':
# 导出为Word格式每个文章一个文件夹
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
for article in articles:
# 创建文章文件夹名称
safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip()
folder_name = f"article_{article.id}_{safe_title}"[:50]
# 创建Word文档
doc = Document()
doc.add_heading(article.title, 0)
# 添加文章信息
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"网址: {article.url}")
doc.add_paragraph(f"发布时间: {article.pub_date.isoformat() if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}")
# 添加内容标题
doc.add_heading('内容:', level=1)
# 处理HTML内容
content_text = BeautifulSoup(article.content, 'html.parser').get_text()
doc.add_paragraph(content_text)
# 将文档保存到内存中
doc_buffer = io.BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
# 添加到ZIP文件
zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue())
# 添加媒体文件(如果存在)
if article.media_files:
for media in article.media_files:
try:
# 如果是本地文件路径
if not media.startswith('http'):
media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/'))
if os.path.exists(media_path):
zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}")
# 如果是URL格式的媒体文件
else:
import requests
from io import BytesIO
response = requests.get(media, timeout=10)
if response.status_code == 200:
image_stream = BytesIO(response.content)
media_filename = f"{folder_name}/media/{os.path.basename(media)}"
zip_file.writestr(media_filename, image_stream.getvalue())
except Exception:
# 忽略无法添加的媒体文件
pass
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename="articles.zip"'
return response
elif export_format == 'zip':
# 导出为ZIP包每个文章一个文件夹
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
for article in articles:
# 创建文章文件夹名称
safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip()
folder_name = f"article_{article.id}_{safe_title}"[:50]
# 创建Word文档
doc = Document()
doc.add_heading(article.title, 0)
# 添加文章信息
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"网址: {article.url}")
doc.add_paragraph(f"发布时间: {article.pub_date.isoformat() if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}")
# 添加内容标题
doc.add_heading('内容:', level=1)
# 处理HTML内容
content_text = BeautifulSoup(article.content, 'html.parser').get_text()
doc.add_paragraph(content_text)
# 将文档保存到内存中
doc_buffer = io.BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
# 添加到ZIP文件
zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue())
# 添加媒体文件(如果存在)
if article.media_files:
for media in article.media_files:
try:
# 如果是本地文件路径
if not media.startswith('http'):
media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/'))
if os.path.exists(media_path):
zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}")
# 如果是URL格式的媒体文件
else:
import requests
from io import BytesIO
response = requests.get(media, timeout=10)
if response.status_code == 200:
image_stream = BytesIO(response.content)
media_filename = f"{folder_name}/media/{os.path.basename(media)}"
zip_file.writestr(media_filename, image_stream.getvalue())
except Exception:
# 忽略无法添加的媒体文件
pass
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename="articles.zip"'
return response
else:
return api_response(message="不支持的导出格式", status=400, error="Unsupported format")
except json.JSONDecodeError:
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
except Exception as e:
logger.error(f"导出文章失败: {e}")
return api_response(message="导出文章失败", status=500, error=str(e))

core/apps.py

@@ -4,3 +4,8 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'core'
def ready(self):
"""应用启动时执行"""
# 导入Admin扩展
import core.admin_extended

276
core/distributed_crawler.py Normal file

@@ -0,0 +1,276 @@
"""
分布式爬虫模块
支持多节点爬虫集群,任务分发和结果聚合
"""
import json
import logging
import time
from typing import Dict, List, Optional, Any
from celery import group, chain
from django.conf import settings
from django.core.cache import cache
from django.db import transaction
from .models import Website, Article
from .tasks import crawl_website, crawl_all_websites
from .utils import full_site_crawler
logger = logging.getLogger(__name__)
class DistributedCrawler:
"""分布式爬虫管理器"""
def __init__(self):
self.cache_prefix = "crawler:distributed:"
self.task_timeout = getattr(settings, 'CRAWLER_TASK_TIMEOUT', 1800) # 30分钟
def get_node_status(self, node_id: str) -> Dict[str, Any]:
"""获取节点状态"""
cache_key = f"{self.cache_prefix}node:{node_id}:status"
status = cache.get(cache_key, {})
return {
'node_id': node_id,
'status': status.get('status', 'unknown'),
'last_heartbeat': status.get('last_heartbeat'),
'active_tasks': status.get('active_tasks', 0),
'completed_tasks': status.get('completed_tasks', 0),
'failed_tasks': status.get('failed_tasks', 0),
}
def register_node(self, node_id: str, capacity: int = 10) -> bool:
"""注册爬虫节点"""
cache_key = f"{self.cache_prefix}node:{node_id}:status"
status = {
'status': 'active',
'capacity': capacity,
'active_tasks': 0,
'completed_tasks': 0,
'failed_tasks': 0,
'last_heartbeat': time.time(),
'registered_at': time.time(),
}
cache.set(cache_key, status, timeout=3600) # 1小时过期
# 添加到节点列表
nodes_key = f"{self.cache_prefix}active_nodes"
nodes = cache.get(nodes_key, [])
if node_id not in nodes:
nodes.append(node_id)
cache.set(nodes_key, nodes, timeout=3600)
logger.info(f"注册爬虫节点: {node_id}, 容量: {capacity}")
return True
def unregister_node(self, node_id: str) -> bool:
"""注销爬虫节点"""
cache_key = f"{self.cache_prefix}node:{node_id}:status"
cache.delete(cache_key)
# 从节点列表移除
nodes_key = f"{self.cache_prefix}active_nodes"
nodes = cache.get(nodes_key, [])
if node_id in nodes:
nodes.remove(node_id)
cache.set(nodes_key, nodes, timeout=3600)
logger.info(f"注销爬虫节点: {node_id}")
return True
def heartbeat(self, node_id: str, active_tasks: int = 0) -> bool:
"""节点心跳"""
cache_key = f"{self.cache_prefix}node:{node_id}:status"
status = cache.get(cache_key, {})
if status:
status['last_heartbeat'] = time.time()
status['active_tasks'] = active_tasks
cache.set(cache_key, status, timeout=3600)
return True
def get_available_nodes(self) -> List[str]:
"""获取可用节点列表"""
nodes_key = f"{self.cache_prefix}active_nodes"
nodes = cache.get(nodes_key, [])
available_nodes = []
for node_id in nodes:
status = self.get_node_status(node_id)
if status['status'] == 'active':
# 检查心跳是否在5分钟内
if status['last_heartbeat'] and (time.time() - status['last_heartbeat']) < 300:
available_nodes.append(node_id)
return available_nodes
def distribute_crawl_tasks(self, websites: List[int], max_concurrent: int = 5) -> str:
"""分发爬虫任务到多个节点"""
if not websites:
return "no_websites"
available_nodes = self.get_available_nodes()
if not available_nodes:
logger.warning("没有可用的爬虫节点")
return "no_available_nodes"
# 创建任务批次
batch_id = f"batch_{int(time.time())}"
batch_key = f"{self.cache_prefix}batch:{batch_id}"
# 将网站分组分配给不同节点
tasks = []
for i, website_id in enumerate(websites):
node_id = available_nodes[i % len(available_nodes)]
task = crawl_website.apply_async(
args=[website_id],
kwargs={'node_id': node_id, 'batch_id': batch_id},
countdown=i * 2 # 错开启动时间
)
tasks.append(task)
# 保存批次信息
batch_info = {
'batch_id': batch_id,
'websites': websites,
'tasks': [task.id for task in tasks],
'nodes': available_nodes,
'status': 'running',
'created_at': time.time(),
'total_tasks': len(tasks),
'completed_tasks': 0,
'failed_tasks': 0,
}
cache.set(batch_key, batch_info, timeout=7200) # 2小时过期
logger.info(f"创建分布式爬虫批次: {batch_id}, 任务数: {len(tasks)}, 节点数: {len(available_nodes)}")
return batch_id
def get_batch_status(self, batch_id: str) -> Dict[str, Any]:
"""获取批次状态"""
batch_key = f"{self.cache_prefix}batch:{batch_id}"
batch_info = cache.get(batch_key, {})
if not batch_info:
return {'status': 'not_found'}
# 统计任务状态
completed = 0
failed = 0
running = 0
for task_id in batch_info.get('tasks', []):
task_result = cache.get(f"{self.cache_prefix}task:{task_id}")
if task_result:
if task_result.get('status') == 'completed':
completed += 1
elif task_result.get('status') == 'failed':
failed += 1
else:
running += 1
batch_info.update({
'completed_tasks': completed,
'failed_tasks': failed,
'running_tasks': running,
'progress': (completed + failed) / batch_info.get('total_tasks', 1) * 100
})
# 检查是否完成
if completed + failed >= batch_info.get('total_tasks', 0):
batch_info['status'] = 'completed'
cache.set(batch_key, batch_info, timeout=7200)
return batch_info
def get_all_batches(self) -> List[Dict[str, Any]]:
"""获取所有批次"""
pattern = f"{self.cache_prefix}batch:*"
batches = []
# 这里简化实现实际应该使用Redis的SCAN命令
for i in range(100): # 假设最多100个批次
batch_key = f"{self.cache_prefix}batch:batch_{i}"
batch_info = cache.get(batch_key)
if batch_info:
batches.append(batch_info)
return sorted(batches, key=lambda x: x.get('created_at', 0), reverse=True)
def cleanup_old_batches(self, max_age_hours: int = 24) -> int:
"""清理旧的批次数据"""
cutoff_time = time.time() - (max_age_hours * 3600)
cleaned = 0
for i in range(100):
batch_key = f"{self.cache_prefix}batch:batch_{i}"
batch_info = cache.get(batch_key)
if batch_info and batch_info.get('created_at', 0) < cutoff_time:
cache.delete(batch_key)
cleaned += 1
logger.info(f"清理了 {cleaned} 个旧批次")
return cleaned
class CrawlerNode:
"""爬虫节点"""
def __init__(self, node_id: str, capacity: int = 10):
self.node_id = node_id
self.capacity = capacity
self.distributed_crawler = DistributedCrawler()
self.active_tasks = 0
def start(self):
"""启动节点"""
self.distributed_crawler.register_node(self.node_id, self.capacity)
logger.info(f"爬虫节点 {self.node_id} 已启动")
def stop(self):
"""停止节点"""
self.distributed_crawler.unregister_node(self.node_id)
logger.info(f"爬虫节点 {self.node_id} 已停止")
def heartbeat(self):
"""发送心跳"""
self.distributed_crawler.heartbeat(self.node_id, self.active_tasks)
def process_task(self, website_id: int, batch_id: str = None) -> Dict[str, Any]:
"""处理爬虫任务"""
self.active_tasks += 1
start_time = time.time()
try:
# 执行爬虫任务
website = Website.objects.get(id=website_id)
result = full_site_crawler(website.base_url, website, max_pages=100)
# 记录任务结果
task_result = {
'status': 'completed',
'website_id': website_id,
'website_name': website.name,
'result': result,
'duration': time.time() - start_time,
'completed_at': time.time(),
}
logger.info(f"节点 {self.node_id} 完成网站 {website.name} 爬取")
except Exception as e:
task_result = {
'status': 'failed',
'website_id': website_id,
'error': str(e),
'duration': time.time() - start_time,
'failed_at': time.time(),
}
logger.error(f"节点 {self.node_id} 爬取网站 {website_id} 失败: {e}")
finally:
self.active_tasks -= 1
return task_result
# 全局分布式爬虫实例
distributed_crawler = DistributedCrawler()
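For orientation, a minimal usage sketch of the node and dispatcher classes above. Assumptions: the module is importable as core.distributed_crawler (the file name is not visible in this part of the compare view), a shared Django cache such as Redis is configured, and Celery workers are running so the crawl_website tasks can actually execute; the node id is illustrative.

from core.models import Website
from core.distributed_crawler import CrawlerNode, distributed_crawler  # assumed module path

# On each worker host: register the node and keep its heartbeat fresh
node = CrawlerNode(node_id="worker-1", capacity=10)
node.start()
node.heartbeat()  # in practice run periodically, e.g. from a beat schedule

# On the dispatcher: spread all known websites across the registered nodes
website_ids = list(Website.objects.values_list("id", flat=True))
batch_id = distributed_crawler.distribute_crawl_tasks(website_ids, max_concurrent=5)
if batch_id not in ("no_websites", "no_available_nodes"):
    print(distributed_crawler.get_batch_status(batch_id))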

765
core/keyword_crawler.py Normal file
View File

@@ -0,0 +1,765 @@
"""
关键词爬虫引擎
基于 crawler_engine.py 的关键词爬取方法改进
"""
import requests
import time
import re
import logging
import os
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from django.conf import settings
from django.utils import timezone
from django.core.files.base import ContentFile
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from .models import Website, CrawlTask, Article
from .utils import get_page_with_selenium, get_page_with_requests, check_keyword_in_content
# 禁用SSL警告
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# 设置日志记录器
logger = logging.getLogger(__name__)
class KeywordCrawler:
"""关键词爬虫引擎"""
def __init__(self, task_id, task_executor_instance=None):
self.task = CrawlTask.objects.get(id=task_id)
self.task_id = task_id
self.task_executor = task_executor_instance
self.keywords = [kw.strip() for kw in self.task.keyword.split(',') if kw.strip()] if self.task.keyword else []
# 创建带重试策略的会话
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
})
# 设置重试策略
retry_strategy = Retry(
total=3,
backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
self.session.mount("http://", adapter)
self.session.mount("https://", adapter)
# 设置超时
self.timeout = 15
def log(self, level, message, website=None):
"""记录日志"""
print(f"[{level.upper()}] {message}")
logger.log(getattr(logging, level.upper()), f"Task {self.task.id}: {message}")
def is_cancelled(self):
"""检查任务是否已被取消"""
if self.task_executor:
return self.task_executor.is_task_cancelled(self.task_id)
return False
def update_task_status(self, status, **kwargs):
"""更新任务状态"""
self.task.status = status
if status == 'running' and not self.task.started_at:
self.task.started_at = timezone.now()
elif status in ['completed', 'failed', 'cancelled']:
self.task.completed_at = timezone.now()
for key, value in kwargs.items():
setattr(self.task, key, value)
self.task.save()
def extract_text_content(self, soup):
"""提取文本内容,保持段落结构"""
# 移除脚本和样式标签
for script in soup(["script", "style"]):
script.decompose()
# 处理段落标签,保持段落结构
paragraphs = []
# 查找所有段落相关的标签
for element in soup.find_all(['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br']):
if element.name in ['p', 'div']:
text = element.get_text().strip()
if text:
paragraphs.append(text)
elif element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
text = element.get_text().strip()
if text:
paragraphs.append(f"\n{text}\n") # 标题前后加换行
elif element.name == 'br':
paragraphs.append('\n')
# 如果没有找到段落标签,使用原来的方法
if not paragraphs:
text = soup.get_text()
# 清理文本但保持换行
lines = []
for line in text.splitlines():
line = line.strip()
if line:
lines.append(line)
return '\n\n'.join(lines)
# 合并段落,用双换行分隔
content = '\n\n'.join(paragraphs)
# 清理多余的空行
content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
return content.strip()
def clean_url(self, url):
"""清理和修复URL"""
try:
# 处理空值或None
if not url or url is None:
return ""
# 修复常见的URL问题
# 将错误的编码字符恢复
url = str(url).replace('%C3%97', '×') # 修复 × 字符的错误编码
url = url.replace('%E2%80%93', '') # 修复 – 字符的错误编码
url = url.replace('%E2%80%94', '') # 修复 — 字符的错误编码
# 解析URL并重新构建
parsed = urlparse(url)
# 清理查询参数
if parsed.query:
# 处理查询参数中的编码问题
from urllib.parse import parse_qs, urlencode, unquote
query_params = parse_qs(parsed.query)
cleaned_params = {}
for key, values in query_params.items():
# 解码参数名
clean_key = unquote(key)
# 解码参数值
clean_values = [unquote(val) for val in values]
cleaned_params[clean_key] = clean_values
# 重新构建查询字符串
query_string = urlencode(cleaned_params, doseq=True)
else:
query_string = ''
# 重新构建URL
clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
if query_string:
clean_url += f"?{query_string}"
if parsed.fragment:
clean_url += f"#{parsed.fragment}"
return clean_url
except Exception as e:
self.log('warning', f'URL清理失败: {url}, 错误: {e}')
return url
def is_valid_article_url(self, url):
"""检查是否是有效的文章URL"""
try:
# 排除一些明显不是文章的URL
exclude_patterns = [
'javascript:', 'mailto:', '#', 'tel:',
'.pdf', '.doc', '.docx', '.xls', '.xlsx',
'.jpg', '.jpeg', '.png', '.gif', '.svg',
'.mp3', '.mp4', '.avi', '.mov'
]
url_lower = url.lower()
for pattern in exclude_patterns:
if pattern in url_lower:
return False
# 检查URL长度
if len(url) < 10:
return False
# 检查是否包含文章相关的关键词
article_keywords = ['article', 'news', 'content', 'detail', 'view', 'show', 'post']
url_lower = url.lower()
for keyword in article_keywords:
if keyword in url_lower:
return True
# 如果URL看起来像文章ID或路径也认为是有效的
if any(char.isdigit() for char in url) and len(url.split('/')) > 3:
return True
return False
except Exception:
return False
def find_article_links(self, soup, base_url):
"""查找文章链接"""
links = []
seen_urls = set() # 避免重复URL
# 常见的文章链接选择器
selectors = [
'a[href*="article"]',
'a[href*="news"]',
'a[href*="content"]',
'a[href*="detail"]',
'a[href*="view"]',
'a[href*="show"]',
'.news-list a',
'.article-list a',
'.content-list a',
'h3 a',
'h4 a',
'.title a',
'.list-item a'
]
for selector in selectors:
elements = soup.select(selector)
for element in elements:
href = element.get('href')
if href:
# 清理和修复URL
clean_href = self.clean_url(href)
full_url = urljoin(base_url, clean_href)
# 再次清理完整URL
full_url = self.clean_url(full_url)
# 检查URL是否有效且未重复
if (full_url not in seen_urls and
self.is_valid_article_url(full_url) and
full_url.startswith(('http://', 'https://'))):
title = element.get_text().strip()
if title and len(title) > 5: # 过滤掉太短的标题
links.append({
'url': full_url,
'title': title
})
seen_urls.add(full_url)
return links
def check_keyword_match(self, text, title):
"""检查关键字匹配 - 改进版本"""
matched_keywords = []
text_lower = text.lower()
title_lower = title.lower()
for keyword in self.keywords:
keyword_lower = keyword.lower()
# 使用改进的关键字检查函数
if check_keyword_in_content(text, keyword) or check_keyword_in_content(title, keyword):
matched_keywords.append(keyword)
return matched_keywords
def extract_article_content(self, url, soup):
"""提取文章内容"""
# 尝试多种内容选择器
content_selectors = [
'.article-content',
'.content',
'.article-body',
'.news-content',
'.main-content',
'.post-content',
'article',
'.detail-content',
'#content',
'.text',
'.box_con', # 新华网等网站使用
'.content_area', # 央视网等网站使用
]
content = ""
for selector in content_selectors:
element = soup.select_one(selector)
if element:
content = self.extract_text_content(element)
if len(content) > 100: # 确保内容足够长
break
# 如果没找到特定内容区域,使用整个页面
if not content or len(content) < 100:
content = self.extract_text_content(soup)
return content
def extract_publish_date(self, soup):
"""提取发布时间"""
date_selectors = [
'.publish-time',
'.pub-time',
'.date',
'.time',
'.publish-date',
'time[datetime]',
'.article-time',
'.news-time',
'.post-time',
'.create-time',
'.update-time',
'.time span',
'.date span',
'.info span',
'.meta span',
'.meta-info',
'.article-info span',
'.news-info span',
'.content-info span',
'.a-shijian',
'.l-time'
]
for selector in date_selectors:
elements = soup.select(selector)
for element in elements:
date_text = element.get_text().strip()
if element.get('datetime'):
date_text = element.get('datetime')
# 如果文本太短或为空,跳过
if not date_text or len(date_text) < 4:
continue
# 尝试解析日期
try:
from datetime import datetime
# 清理日期文本
date_text = re.sub(r'发布(时间|日期)[:：]?', '', date_text).strip()
date_text = re.sub(r'时间[:：]?', '', date_text).strip()
date_text = re.sub(r'日期[:：]?', '', date_text).strip()
date_text = re.sub(r'发表于[:：]?', '', date_text).strip()
date_text = re.sub(r'更新[:：]?', '', date_text).strip()
date_text = re.sub(r'\s+', ' ', date_text).strip()
# 如果有 datetime 属性且是标准格式,直接使用
if element.get('datetime'):
datetime_attr = element.get('datetime')
# 尝试解析常见的日期时间格式
for fmt in [
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%dT%H:%M:%S',
'%Y-%m-%dT%H:%M:%S%z',
'%Y-%m-%d %H:%M',
'%Y-%m-%d',
'%Y/%m/%d %H:%M:%S',
'%Y/%m/%d %H:%M',
'%Y/%m/%d',
'%Y年%m月%d日 %H:%M:%S',
'%Y年%m月%d日 %H:%M',
'%Y年%m月%d日',
]:
try:
if '%z' in fmt and '+' not in datetime_attr and datetime_attr.endswith('Z'):
datetime_attr = datetime_attr[:-1] + '+0000'
parsed_date = datetime.strptime(datetime_attr, fmt)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except ValueError:
continue
# 尝试解析从文本中提取的日期
for fmt in [
'%Y年%m月%d日 %H:%M:%S',
'%Y年%m月%d日 %H:%M',
'%Y年%m月%d日',
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%d %H:%M',
'%Y-%m-%d',
'%Y/%m/%d %H:%M:%S',
'%Y/%m/%d %H:%M',
'%Y/%m/%d',
'%m月%d日 %H:%M',
'%m月%d日',
]:
try:
parsed_date = datetime.strptime(date_text, fmt)
# 如果没有年份,使用当前年份
if '%Y' not in fmt:
parsed_date = parsed_date.replace(year=datetime.now().year)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except ValueError:
continue
# 如果以上格式都不匹配,尝试使用 dateutil 解析
try:
from dateutil import parser
if len(date_text) > 5 and not date_text.isdigit():
parsed_date = parser.parse(date_text)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except Exception:
pass
except Exception as e:
self.log('debug', f'解析日期失败: {date_text}, 错误: {str(e)}')
continue
return None
def extract_author(self, soup):
"""提取作者信息"""
author_selectors = [
'.author',
'.writer',
'.publisher',
'.byline',
'.article-author',
'.news-author',
'.source'
]
for selector in author_selectors:
element = soup.select_one(selector)
if element:
return element.get_text().strip()
return ""
def download_media_file(self, media_url, article, media_type='image', alt_text=''):
"""下载媒体文件 - 适配现有模型结构"""
try:
# 检查URL是否有效
if not media_url or not media_url.startswith(('http://', 'https://')):
return None
# 请求媒体文件
response = self.session.get(
media_url,
timeout=self.timeout,
verify=False,
stream=False
)
response.raise_for_status()
# 获取文件信息
content_type = response.headers.get('content-type', '')
file_size = len(response.content)
# 确定文件扩展名
file_extension = self.get_file_extension_from_url(media_url, content_type)
# 生成文件名
existing_media_count = len(article.media_files) if article.media_files else 0
filename = f"media_{article.id}_{existing_media_count}{file_extension}"
# 创建媒体文件信息字典
media_info = {
'type': media_type,
'original_url': media_url,
'filename': filename,
'file_size': file_size,
'mime_type': content_type,
'alt_text': alt_text,
'downloaded_at': timezone.now().isoformat()
}
# 更新文章的媒体文件列表
if not article.media_files:
article.media_files = [media_info]
else:
article.media_files.append(media_info)
# 保存文件到本地(这里简化处理,实际项目中可能需要更复杂的文件存储)
self.log('info', f'媒体文件已记录: {filename} ({media_type})')
return media_info
except Exception as e:
self.log('error', f'下载媒体文件失败 {media_url}: {str(e)}')
return None
def get_file_extension_from_url(self, url, content_type):
"""从URL或内容类型获取文件扩展名"""
# 从URL获取扩展名
parsed_url = urlparse(url)
path = parsed_url.path
if '.' in path:
return os.path.splitext(path)[1]
# 从内容类型获取扩展名
content_type_map = {
'image/jpeg': '.jpg',
'image/jpg': '.jpg',
'image/png': '.png',
'image/gif': '.gif',
'image/webp': '.webp',
'image/svg+xml': '.svg',
'video/mp4': '.mp4',
'video/avi': '.avi',
'video/mov': '.mov',
'video/wmv': '.wmv',
'video/flv': '.flv',
'video/webm': '.webm',
'audio/mp3': '.mp3',
'audio/wav': '.wav',
'audio/ogg': '.ogg',
'application/pdf': '.pdf',
'application/msword': '.doc',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
}
return content_type_map.get(content_type.lower(), '.bin')
def extract_and_download_media(self, soup, article, base_url):
"""提取并下载页面中的媒体文件"""
media_files = []
# 提取图片
images = soup.find_all('img')
self.log('info', f'找到 {len(images)} 个图片标签')
for img in images:
src = img.get('src')
if src:
# 处理相对URL
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = urljoin(base_url, src)
elif not src.startswith(('http://', 'https://')):
src = urljoin(base_url, src)
alt_text = img.get('alt', '')
media_file = self.download_media_file(src, article, 'image', alt_text)
if media_file:
media_files.append(media_file)
# 提取视频
videos = soup.find_all(['video', 'source'])
for video in videos:
src = video.get('src')
if src:
# 处理相对URL
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = urljoin(base_url, src)
elif not src.startswith(('http://', 'https://')):
src = urljoin(base_url, src)
media_file = self.download_media_file(src, article, 'video')
if media_file:
media_files.append(media_file)
# download_media_file only mutates article.media_files in memory, so persist the list once here
if media_files:
article.save(update_fields=['media_files'])
return media_files
def crawl_website(self, website):
"""爬取单个网站"""
self.log('info', f'开始爬取网站: {website.name}')
try:
# 请求主页
response = self.session.get(
website.base_url,
timeout=self.timeout,
verify=False
)
response.raise_for_status()
# 检查内容编码
if response.encoding != 'utf-8':
content_type = response.headers.get('content-type', '')
if 'charset=' in content_type:
charset = content_type.split('charset=')[-1]
response.encoding = charset
else:
response.encoding = 'utf-8'
soup = BeautifulSoup(response.content, 'html.parser')
# 查找文章链接
article_links = self.find_article_links(soup, website.base_url)
self.log('info', f'找到 {len(article_links)} 个文章链接')
crawled_count = 0
for link_info in article_links:
# 检查任务是否已被取消
if self.is_cancelled():
self.log('info', '任务已被取消,停止处理文章')
return crawled_count
try:
# 清理和验证URL
clean_url = self.clean_url(link_info['url'])
# 检查URL是否仍然有效
if not self.is_valid_article_url(clean_url):
self.log('warning', f'跳过无效URL: {clean_url}')
continue
self.log('info', f'正在处理文章: {clean_url}')
# 请求文章页面
article_response = self.session.get(
clean_url,
timeout=self.timeout,
verify=False
)
article_response.raise_for_status()
# 检查内容编码
if article_response.encoding != 'utf-8':
content_type = article_response.headers.get('content-type', '')
if 'charset=' in content_type:
charset = content_type.split('charset=')[-1]
article_response.encoding = charset
else:
article_response.encoding = 'utf-8'
article_soup = BeautifulSoup(article_response.content, 'html.parser')
# 提取内容
content = self.extract_article_content(clean_url, article_soup)
title = link_info['title']
# 检查关键字匹配
matched_keywords = self.check_keyword_match(content, title)
if matched_keywords:
# 提取其他信息
publish_date = self.extract_publish_date(article_soup)
author = self.extract_author(article_soup)
# 检查是否已存在相同URL的文章
existing_article = Article.objects.filter(
url=clean_url
).first()
if existing_article:
# 如果已存在,更新现有记录
existing_article.title = title
existing_article.content = content
existing_article.pub_date = publish_date
existing_article.media_files = [] # 重置媒体文件列表
existing_article.save()
# 更新媒体文件
media_files = self.extract_and_download_media(article_soup, existing_article, clean_url)
self.log('info', f'更新已存在的文章: {title[:50]}...')
else:
# 保存新内容
article = Article.objects.create(
website=website,
title=title,
content=content,
url=clean_url,
pub_date=publish_date,
media_files=[]
)
# 提取并下载媒体文件
media_files = self.extract_and_download_media(article_soup, article, clean_url)
self.log('info', f'保存新文章: {title[:50]}...')
crawled_count += 1
# 请求间隔
time.sleep(1)
except requests.exceptions.HTTPError as e:
if e.response.status_code == 404:
self.log('warning', f'文章不存在 (404): {clean_url}')
elif e.response.status_code == 403:
self.log('warning', f'访问被拒绝 (403): {clean_url}')
elif e.response.status_code == 429:
self.log('warning', f'请求过于频繁 (429): {clean_url}')
time.sleep(5) # 等待更长时间
else:
self.log('error', f'HTTP错误 {e.response.status_code}: {clean_url}')
continue
except requests.exceptions.Timeout as e:
self.log('warning', f'请求超时: {clean_url}')
continue
except requests.exceptions.ConnectionError as e:
self.log('warning', f'连接错误: {clean_url}')
continue
except Exception as e:
self.log('error', f'处理文章失败 {clean_url}: {str(e)}')
continue
self.log('info', f'网站爬取完成,共保存 {crawled_count} 篇文章')
return crawled_count
except Exception as e:
self.log('error', f'爬取网站失败: {str(e)}')
return 0
def run(self):
"""运行爬取任务"""
self.log('info', f'开始执行关键词爬取任务: {self.task.name}')
self.update_task_status('running')
total_crawled = 0
websites = self.task.websites.all()
self.task.total_pages = websites.count()
self.task.save()
for website in websites:
# 检查任务是否已被取消
if self.is_cancelled():
self.log('info', '任务已被取消,停止爬取')
self.update_task_status('cancelled', error_message='任务被取消')
return
try:
crawled_count = self.crawl_website(website)
total_crawled += crawled_count
self.task.crawled_pages += 1
self.task.save()
# 再次检查任务是否已被取消
if self.is_cancelled():
self.log('info', '任务已被取消,停止爬取')
self.update_task_status('cancelled', error_message='任务被取消')
return
except Exception as e:
self.log('error', f'爬取网站 {website.name} 时发生错误: {str(e)}')
continue
# 更新任务状态
if total_crawled > 0:
self.update_task_status('completed')
self.log('info', f'关键词爬取任务完成,共爬取 {total_crawled} 篇文章')
else:
self.update_task_status('failed', error_message='没有找到匹配的内容')
self.log('error', '关键词爬取任务失败,没有找到匹配的内容')
def run_keyword_crawl_task(task_id, task_executor_instance=None):
"""运行关键词爬取任务"""
try:
crawler = KeywordCrawler(task_id, task_executor_instance)
crawler.run()
return f"关键词爬取任务 {task_id} 执行完成"
except Exception as e:
# 记录异常到日志
logger.error(f"执行关键词爬取任务 {task_id} 时发生异常: {str(e)}", exc_info=True)
task = CrawlTask.objects.get(id=task_id)
task.status = 'failed'
task.error_message = str(e)
task.completed_at = timezone.now()
task.save()
return f"关键词爬取任务 {task_id} 执行失败: {str(e)}"

View File

@@ -0,0 +1,77 @@
from django.core.management.base import BaseCommand
from django.core.management import call_command
from core.models import Website
class Command(BaseCommand):
help = "批量爬取所有中央主流媒体"
def add_arguments(self, parser):
parser.add_argument('--media', type=str, help='指定要爬取的媒体,用逗号分隔')
parser.add_argument('--platform', type=str, default='all',
help='指定平台类型: all(全部), web(网站)')
def handle(self, *args, **options):
media_list = options['media']
platform = options['platform']
# 所有中央主流媒体配置
all_media = {
'rmrb': 'crawl_rmrb',
'xinhua': 'crawl_xinhua',
'cctv': 'crawl_cctv',
'qiushi': 'crawl_qiushi',
'pla': 'crawl_pla',
'gmrb': 'crawl_gmrb',
'jjrb': 'crawl_jjrb',
'chinadaily': 'crawl_chinadaily',
'grrb': 'crawl_grrb',
'kjrb': 'crawl_kjrb',
'rmzxb': 'crawl_rmzxb',
'zgjwjc': 'crawl_zgjwjc',
'chinanews': 'crawl_chinanews',
'xxsb': 'crawl_xxsb',
'zgqnb': 'crawl_zgqnb',
'zgfnb': 'crawl_zgfnb',
'fzrb': 'crawl_fzrb',
'nmrb': 'crawl_nmrb',
'xuexi': 'crawl_xuexi',
'qizhi': 'crawl_qizhi',
'china': 'crawl_china'
}
# 如果指定了特定媒体,则只爬取指定的媒体
if media_list:
target_media = [media.strip() for media in media_list.split(',')]
else:
target_media = list(all_media.keys())
self.stdout.write(f"开始批量爬取 {len(target_media)} 家中央主流媒体...")
for media in target_media:
if media in all_media:
command_name = all_media[media]
try:
self.stdout.write(f"正在爬取: {media}")
call_command(command_name, platform=platform)
self.stdout.write(self.style.SUCCESS(f"完成爬取: {media}"))
except Exception as e:
self.stdout.write(self.style.ERROR(f"爬取 {media} 失败: {e}"))
else:
self.stdout.write(self.style.WARNING(f"未知媒体: {media}"))
self.stdout.write(self.style.SUCCESS("所有中央主流媒体爬取完成"))
# 显示统计信息
total_websites = Website.objects.count()
total_articles = sum([website.article_set.count() for website in Website.objects.all()])
self.stdout.write(f"统计信息:")
self.stdout.write(f"- 总网站数: {total_websites}")
self.stdout.write(f"- 总文章数: {total_articles}")
# 显示各媒体文章数量
self.stdout.write(f"各媒体文章数量:")
for website in Website.objects.all():
article_count = website.article_set.count()
self.stdout.write(f"- {website.name}: {article_count}")

View File

@@ -0,0 +1,266 @@
import json
from django.core.management.base import BaseCommand
from core.utils import full_site_crawler, crawl_by_keyword, WEBSITE_SEARCH_CONFIGS
class Command(BaseCommand):
help = '一键爬取所有网站'
def add_arguments(self, parser):
parser.add_argument(
'--mode', '-m',
type=str,
default='both',
choices=['full', 'keyword', 'both'],
help='爬取模式: full(全站爬取), keyword(关键词搜索), both(两者都执行)'
)
parser.add_argument(
'--keyword', '-k',
type=str,
help='关键词搜索的关键词'
)
parser.add_argument(
'--websites', '-w',
type=str,
nargs='*',
help='指定要爬取的网站列表'
)
parser.add_argument(
'--max-pages', '-p',
type=int,
default=500,
help='全站爬取的最大页数'
)
parser.add_argument(
'--max-search-pages', '-P',
type=int,
default=10,
help='关键词搜索的最大页数'
)
parser.add_argument(
'--max-articles', '-a',
type=int,
default=100,
help='关键词搜索的最大文章数'
)
parser.add_argument(
'--start-date', '-s',
type=str,
help='开始日期 (YYYY-MM-DD)'
)
parser.add_argument(
'--end-date', '-e',
type=str,
help='结束日期 (YYYY-MM-DD)'
)
parser.add_argument(
'--output', '-o',
type=str,
help='将结果保存到JSON文件'
)
parser.add_argument(
'--skip-existing', '-S',
action='store_true',
help='跳过已存在的网站'
)
parser.add_argument(
'--list-websites', '-l',
action='store_true',
help='列出所有支持的网站'
)
def handle(self, *args, **options):
# 列出支持的网站
if options['list_websites']:
self.stdout.write(self.style.SUCCESS("支持的网站列表:"))
for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
self.stdout.write(f"{i:2d}. {website}")
return
mode = options['mode']
keyword = options['keyword']
websites = options['websites']
max_pages = options['max_pages']
max_search_pages = options['max_search_pages']
max_articles = options['max_articles']
start_date = options['start_date']
end_date = options['end_date']
output_file = options['output']
skip_existing = options['skip_existing']
# 验证网站名称
if websites:
# 确保websites是列表类型
if isinstance(websites, str):
websites = [websites]
invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
if invalid_websites:
# 确保invalid_websites是可迭代的
if isinstance(invalid_websites, str):
invalid_websites = [invalid_websites]
self.stdout.write(
self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}")
)
self.stdout.write("使用 --list-websites 查看支持的网站列表")
return
# 确定要爬取的网站列表
target_websites = websites if websites else list(WEBSITE_SEARCH_CONFIGS.keys())
# 验证关键词模式
if mode in ['keyword', 'both'] and not keyword:
self.stdout.write(
self.style.ERROR("关键词模式需要指定 --keyword 参数")
)
return
self.stdout.write(f"开始一键爬取任务...")
self.stdout.write(f"爬取模式: {mode}")
# 确保target_websites是可迭代的
if isinstance(target_websites, str):
target_websites = [target_websites]
self.stdout.write(f"目标网站: {', '.join(target_websites)}")
if keyword:
self.stdout.write(f"关键词: {keyword}")
if start_date:
self.stdout.write(f"开始日期: {start_date}")
if end_date:
self.stdout.write(f"结束日期: {end_date}")
all_results = {
"mode": mode,
"websites": target_websites,
"keyword": keyword,
"start_date": start_date,
"end_date": end_date,
"full_crawl_results": {},
"keyword_crawl_results": {},
"summary": {
"total_websites": len(target_websites),
"full_crawl_success": 0,
"full_crawl_failed": 0,
"keyword_crawl_success": 0,
"keyword_crawl_failed": 0
}
}
try:
for website_name in target_websites:
self.stdout.write(f"\n{'='*50}")
self.stdout.write(f"开始处理网站: {website_name}")
self.stdout.write(f"{'='*50}")
# 获取或创建网站对象
from core.models import Website
website, created = Website.objects.get_or_create(
name=website_name,
defaults={
'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
'enabled': True
}
)
if not created and skip_existing:
self.stdout.write(f"跳过已存在的网站: {website_name}")
continue
website_results = {
"full_crawl": None,
"keyword_crawl": None
}
# 全站爬取
if mode in ['full', 'both']:
self.stdout.write(f"\n开始全站爬取: {website_name}")
try:
full_site_crawler(
WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
website,
max_pages=max_pages
)
self.stdout.write(self.style.SUCCESS(f"全站爬取完成: {website_name}"))
website_results["full_crawl"] = {"status": "success"}
all_results["summary"]["full_crawl_success"] += 1
except Exception as e:
self.stdout.write(self.style.ERROR(f"全站爬取失败: {website_name}, 错误: {e}"))
website_results["full_crawl"] = {"status": "failed", "error": str(e)}
all_results["summary"]["full_crawl_failed"] += 1
# 关键词爬取
if mode in ['keyword', 'both']:
self.stdout.write(f"\n开始关键词爬取: {website_name}")
try:
keyword_results = crawl_by_keyword(
keyword=keyword,
website_names=[website_name],
max_pages=max_search_pages,
start_date=start_date,
end_date=end_date,
max_articles=max_articles
)
website_results["keyword_crawl"] = keyword_results
if keyword_results["success_count"] > 0:
all_results["summary"]["keyword_crawl_success"] += 1
else:
all_results["summary"]["keyword_crawl_failed"] += 1
except Exception as e:
self.stdout.write(self.style.ERROR(f"关键词爬取失败: {website_name}, 错误: {e}"))
website_results["keyword_crawl"] = {"status": "failed", "error": str(e)}
all_results["summary"]["keyword_crawl_failed"] += 1
all_results["full_crawl_results"][website_name] = website_results["full_crawl"]
all_results["keyword_crawl_results"][website_name] = website_results["keyword_crawl"]
# 显示最终结果摘要
self.stdout.write(f"\n{'='*50}")
self.stdout.write(self.style.SUCCESS("一键爬取完成!"))
self.stdout.write(f"{'='*50}")
self.stdout.write(f"总网站数: {all_results['summary']['total_websites']}")
if mode in ['full', 'both']:
self.stdout.write(f"全站爬取 - 成功: {all_results['summary']['full_crawl_success']}, "
f"失败: {all_results['summary']['full_crawl_failed']}")
if mode in ['keyword', 'both']:
self.stdout.write(f"关键词爬取 - 成功: {all_results['summary']['keyword_crawl_success']}, "
f"失败: {all_results['summary']['keyword_crawl_failed']}")
# 显示各网站详细结果
self.stdout.write("\n各网站详细结果:")
for website_name in target_websites:
self.stdout.write(f"\n{website_name}:")
if mode in ['full', 'both']:
full_result = all_results["full_crawl_results"][website_name]
if full_result and full_result.get("status") == "success":
self.stdout.write(self.style.SUCCESS(" 全站爬取: 成功"))
elif full_result:
self.stdout.write(self.style.ERROR(f" 全站爬取: 失败 - {full_result.get('error', '未知错误')}"))
if mode in ['keyword', 'both']:
keyword_result = all_results["keyword_crawl_results"][website_name]
if keyword_result and "success_count" in keyword_result:
self.stdout.write(f" 关键词爬取: 成功 {keyword_result['success_count']} 篇, "
f"失败 {keyword_result['failed_count']}")
elif keyword_result and keyword_result.get("status") == "failed":
self.stdout.write(self.style.ERROR(f" 关键词爬取: 失败 - {keyword_result.get('error', '未知错误')}"))
# 保存结果到文件
if output_file:
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(all_results, f, ensure_ascii=False, indent=2)
self.stdout.write(f"\n结果已保存到: {output_file}")
except Exception as e:
self.stdout.write(self.style.ERROR(f"一键爬取过程中出现错误: {e}"))
raise
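Again the file name is not visible, so the command name below is hypothetical; the website names must match keys of WEBSITE_SEARCH_CONFIGS, which the --list-websites flag prints.

from django.core.management import call_command

# hypothetical command name; the website keys below are placeholders
call_command("crawl_all", list_websites=True)  # show the valid site keys first
call_command(
    "crawl_all",
    mode="keyword",
    keyword="示例关键词",
    websites=["人民网", "新华网"],   # placeholders - must be keys of WEBSITE_SEARCH_CONFIGS
    max_search_pages=5,
    max_articles=50,
    output="crawl_results.json",
)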

View File

@@ -0,0 +1,168 @@
import json
from django.core.management.base import BaseCommand
from core.utils import crawl_by_keyword, WEBSITE_SEARCH_CONFIGS
class Command(BaseCommand):
help = '根据关键词爬取文章'
def add_arguments(self, parser):
parser.add_argument(
'--keyword', '-k',
type=str,
# not marked required so that --list-websites can be used on its own; handle() still
# rejects a missing keyword before crawling
help='搜索关键词'
)
parser.add_argument(
'--websites', '-w',
type=str,
nargs='*',
help='指定要爬取的网站列表'
)
parser.add_argument(
'--max-pages', '-p',
type=int,
default=10,
help='每个网站最大搜索页数'
)
parser.add_argument(
'--max-articles', '-m',
type=int,
default=100,
help='最大文章数量'
)
parser.add_argument(
'--start-date', '-s',
type=str,
help='开始日期 (YYYY-MM-DD)'
)
parser.add_argument(
'--end-date', '-e',
type=str,
help='结束日期 (YYYY-MM-DD)'
)
parser.add_argument(
'--historical', '-H',
action='store_true',
help='使用历史文章爬取模式'
)
parser.add_argument(
'--list-websites', '-l',
action='store_true',
help='列出所有支持的网站'
)
parser.add_argument(
'--output', '-o',
type=str,
help='将结果保存到JSON文件'
)
def handle(self, *args, **options):
# 列出支持的网站
if options['list_websites']:
self.stdout.write(self.style.SUCCESS("支持的网站列表:"))
for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
self.stdout.write(f"{i:2d}. {website}")
return
keyword = options['keyword']
if not keyword:
self.stdout.write(self.style.ERROR("必须指定 --keyword 参数"))
return
websites = options['websites']
max_pages = options['max_pages']
max_articles = options['max_articles']
start_date = options['start_date']
end_date = options['end_date']
historical = options['historical']
output_file = options['output']
# 验证网站名称
if websites:
# 确保websites是列表类型
if isinstance(websites, str):
websites = [websites]
invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
if invalid_websites:
# 确保invalid_websites是可迭代的
if isinstance(invalid_websites, str):
invalid_websites = [invalid_websites]
self.stdout.write(
self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}")
)
self.stdout.write("使用 --list-websites 查看支持的网站列表")
return
self.stdout.write(f"开始爬取任务...")
self.stdout.write(f"关键词: {keyword}")
# 确保websites是可迭代的
if websites:
if isinstance(websites, str):
websites = [websites]
self.stdout.write(f"目标网站: {', '.join(websites)}")
else:
self.stdout.write(f"目标网站: 所有支持的网站 ({len(WEBSITE_SEARCH_CONFIGS)}个)")
if start_date:
self.stdout.write(f"开始日期: {start_date}")
if end_date:
self.stdout.write(f"结束日期: {end_date}")
self.stdout.write(f"最大页数: {max_pages}")
self.stdout.write(f"最大文章数: {max_articles}")
try:
if historical:
# 历史文章爬取模式
self.stdout.write(self.style.WARNING("使用历史文章爬取模式"))
from core.utils import crawl_historical_articles
results = crawl_historical_articles(
website_names=websites,
start_date=start_date,
end_date=end_date,
max_articles_per_site=max_articles
)
else:
# 关键词搜索模式
results = crawl_by_keyword(
keyword=keyword,
website_names=websites,
max_pages=max_pages,
start_date=start_date,
end_date=end_date,
max_articles=max_articles
)
# 显示结果摘要
self.stdout.write(self.style.SUCCESS("\n爬取完成!"))
self.stdout.write(f"总文章数: {results['total_articles']}")
self.stdout.write(f"成功: {results['success_count']}")
self.stdout.write(f"失败: {results['failed_count']}")
# 显示各网站详细结果
self.stdout.write("\n各网站结果:")
for website, result in results['website_results'].items():
status = self.style.SUCCESS if result['success'] > 0 else self.style.WARNING
self.stdout.write(
status(f" {website}: 找到 {result['found_urls']} 篇, "
f"成功 {result['success']}, 失败 {result['failed']}")
)
if 'error' in result:
self.stdout.write(self.style.ERROR(f" 错误: {result['error']}"))
# 保存结果到文件
if output_file:
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=2)
self.stdout.write(f"\n结果已保存到: {output_file}")
except Exception as e:
self.stdout.write(self.style.ERROR(f"爬取过程中出现错误: {e}"))
raise
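A matching sketch for this keyword command (hypothetical command name again, since the file name is not shown); the historical flag switches to crawl_historical_articles() exactly as in the handle() above.

from django.core.management import call_command

call_command("crawl_by_keyword", keyword="示例关键词", max_pages=5, max_articles=50)
call_command(
    "crawl_by_keyword",
    keyword="示例关键词",
    historical=True,                # crawl_historical_articles() instead of keyword search
    start_date="2025-01-01",
    end_date="2025-06-30",
    output="keyword_results.json",
)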

View File

@@ -0,0 +1,61 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
# jimmy.fang:20250815: 因 CCTV 的视频有做加密动作,无法下载,移除支持
class Command(BaseCommand):
help = "全站递归爬取 中央广播电视总台及其子网站、客户端、新媒体平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['cctvnews', 'all'],
help='选择爬取平台: cctvnews(央视新闻), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 中央广播电视总台各平台配置
platforms = {
# jimmy.fang:20250815: 因 CCTV 的视频有做加密动作,无法下载,移除支持
# 'cctv': {
# 'name': '央视网',
# 'base_url': 'https://www.cctv.com',
# 'start_url': 'https://www.cctv.com',
# 'article_selector': 'a'
# },
'cctvnews': {
'name': '央视新闻',
'base_url': 'https://news.cctv.com',
'start_url': 'https://news.cctv.com',
'article_selector': 'a'
}
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("中央广播电视总台所有平台爬取完成"))

View File

@@ -0,0 +1,54 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 中国网主网"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['china', 'all'],
help='选择爬取平台: china(中国网主网), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 中国网各平台配置
platforms = {
'china': {
'name': '中国网',
'base_url': 'http://www.china.com.cn',
'start_url': 'http://www.china.com.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("中国网所有平台爬取完成"))

View File

@@ -0,0 +1,54 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 中国日报平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['chinadaily','all'],
help='选择爬取平台: chinadaily(中国日报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 中国日报各平台配置
platforms = {
'chinadaily': {
'name': '中国日报',
'base_url': 'https://www.chinadaily.com.cn',
'start_url': 'https://www.chinadaily.com.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("中国日报所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 中国新闻社平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['chinanews', 'all'],
help='选择爬取平台: chinanews(中国新闻社), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 中国新闻社各平台配置
platforms = {
'chinanews': {
'name': '中国新闻社',
'base_url': 'https://www.chinanews.com.cn',
'start_url': 'https://www.chinanews.com.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("中国新闻社所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 中国政府网及其子网站"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['govcn', 'all'],
help='选择爬取平台: govcn(中国政府网), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 中国政府网各平台配置
platforms = {
'govcn': {
'name': '中国政府网',
'base_url': 'https://www.gov.cn/',
'start_url': 'https://www.gov.cn/',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("中国政府网所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 东方烟草报"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['eastobacco', 'all'],
help='选择爬取平台: eastobacco(东方烟草报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 东方烟草报各平台配置
platforms = {
'eastobacco': {
'name': '东方烟草报',
'base_url': 'https://www.eastobacco.com/',
'start_url': 'https://www.eastobacco.com/',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("东方烟草报所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 法治日报平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['fzrb', 'all'],
help='选择爬取平台: fzrb(法治日报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 法治日报各平台配置
platforms = {
'fzrb': {
'name': '法治日报',
'base_url': 'http://www.legaldaily.com.cn',
'start_url': 'http://www.legaldaily.com.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("法治日报所有平台爬取完成"))

View File

@@ -0,0 +1,54 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
# jimmy.fang-20250815: 取消对光明日报的支持,光明日报反爬,被阻挡
class Command(BaseCommand):
help = "全站递归爬取 光明日报平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['gmrb', 'all'],
help='选择爬取平台: gmrb(光明日报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 光明日报各平台配置
platforms = {
'gmrb': {
'name': '光明日报',
'base_url': 'https://www.gmw.cn',
'start_url': 'https://www.gmw.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("光明日报所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 工人日报平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['grrb', 'all'],
help='选择爬取平台: grrb(工人日报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 工人日报各平台配置
platforms = {
'grrb': {
'name': '工人日报',
'base_url': 'http://www.workercn.cn',
'start_url': 'http://www.workercn.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("工人日报所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 经济日报平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['jjrb', 'all'],
help='选择爬取平台: jjrb(经济日报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 经济日报各平台配置
platforms = {
'jjrb': {
'name': '经济日报',
'base_url': 'http://www.ce.cn',
'start_url': 'http://www.ce.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("经济日报所有平台爬取完成"))

View File

@@ -0,0 +1,54 @@
# jimmy.fang-20250815: 不支援
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 科技日报平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['kjrb', 'all'],
help='选择爬取平台: kjrb(科技日报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 科技日报各平台配置
platforms = {
'kjrb': {
'name': '科技日报',
'base_url': 'http://digitalpaper.stdaily.com',
'start_url': 'http://digitalpaper.stdaily.com',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("科技日报所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 农民日报及其子网站、客户端、新媒体平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['nmrb', 'all'],
help='选择爬取平台: nmrb(农民日报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 农民日报各平台配置
platforms = {
'nmrb': {
'name': '农民日报',
'base_url': 'http://www.farmer.com.cn',
'start_url': 'http://www.farmer.com.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("农民日报所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 解放军报平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['pla', 'all'],
help='选择爬取平台: pla(解放军报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 解放军报各平台配置
platforms = {
'pla': {
'name': '解放军报',
'base_url': 'https://www.81.cn',
'start_url': 'https://www.81.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("解放军报所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 求是杂志平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['qiushi', 'all'],
help='选择爬取平台: qiushi(求是网), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 求是杂志各平台配置
platforms = {
'qiushi': {
'name': '求是网',
'base_url': 'https://www.qstheory.cn',
'start_url': 'https://www.qstheory.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("求是杂志所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 旗帜网平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['qizhi', 'all'],
help='选择爬取平台: qizhi(旗帜网), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 旗帜网各平台配置
platforms = {
'qizhi': {
'name': '旗帜网',
'base_url': 'http://www.qizhiwang.org.cn',
'start_url': 'http://www.qizhiwang.org.cn',
'article_selector': 'a[href^="/"]' # 修改选择器以更好地匹配文章链接
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("旗帜网所有平台爬取完成"))

View File

@@ -0,0 +1,65 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 人民日报及其子网站、客户端、新媒体平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['peopleapp', 'people', 'paper', 'all'],
help='选择爬取平台: peopleapp(客户端), people(人民网), paper(报纸), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 人民日报各平台配置
platforms = {
'peopleapp': {
'name': '人民日报客户端',
'base_url': 'https://www.peopleapp.com',
'start_url': 'https://www.peopleapp.com/home',
'article_selector': 'a'
},
'people': {
'name': '人民网',
'base_url': 'https://www.people.com.cn',
'start_url': 'https://www.people.com.cn',
'article_selector': 'a'
},
'paper': {
'name': '人民日报报纸',
'base_url': 'http://paper.people.com.cn',
'start_url': 'http://paper.people.com.cn',
'article_selector': 'a'
}
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("人民日报所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 人民政协网平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['rmzxb', 'all'],
help='选择爬取平台: rmzxb(人民政协网), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 人民政协网各平台配置
platforms = {
'rmzxb': {
'name': '人民政协网',
'base_url': 'https://www.rmzxw.com.cn',
'start_url': 'https://www.rmzxw.com.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("人民政协网所有平台爬取完成"))

View File

@@ -1,18 +1,54 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import crawl_xinhua_list
from core.utils import full_site_crawler
class Command(BaseCommand):
help = '批量爬取新华网文章'
help = "全站递归爬取 新华社平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['news', 'all'],
help='选择爬取平台: news(新华网), all(全部)')
def handle(self, *args, **options):
list_url = "https://www.news.cn/legal/index.html"
try:
website = Website.objects.get(base_url="https://www.news.cn/")
except Website.DoesNotExist:
self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
return
platform = options['platform']
self.stdout.write(f"开始爬取文章列表页: {list_url}")
crawl_xinhua_list(list_url, website)
self.stdout.write(self.style.SUCCESS("批量爬取完成"))
# 新华社各平台配置
platforms = {
'news': {
'name': '新华网',
'base_url': 'https://www.news.cn',
'start_url': 'https://www.news.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("新华社所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 学习强国平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['xuexi', 'all'],
help='选择爬取平台: xuexi(学习强国主站), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 学习强国各平台配置
platforms = {
'xuexi': {
'name': '学习强国',
'base_url': 'https://www.xuexi.cn',
'start_url': 'https://www.xuexi.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("学习强国所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 学习时报平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['xxsb', 'all'],
help='选择爬取平台: xxsb(学习时报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 学习时报各平台配置
platforms = {
'xxsb': {
'name': '学习时报',
'base_url': 'http://www.studytimes.cn',
'start_url': 'http://www.studytimes.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("学习时报所有平台爬取完成"))

View File

@@ -0,0 +1,54 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 中国妇女报平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgfnb', 'all'],
help='选择爬取平台: zgfnb(中国妇女报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 中国妇女报各平台配置
platforms = {
'zgfnb': {
'name': '中国妇女报',
'base_url': 'http://www.cnwomen.com.cn',
'start_url': 'http://www.cnwomen.com.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("中国妇女报所有平台爬取完成"))

View File

@@ -0,0 +1,53 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 中国纪检监察报平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgjwjc', 'all'],
help='选择爬取平台: zgjwjc(中国纪检监察报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 中国纪检监察报各平台配置
platforms = {
'zgjwjc': {
'name': '中国纪检监察报',
'base_url': 'https://jjjcb.ccdi.gov.cn',
'start_url': 'https://jjjcb.ccdi.gov.cn',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("中国纪检监察报所有平台爬取完成"))

View File

@@ -0,0 +1,54 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
help = "全站递归爬取 中国青年报平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgqnb', 'all'],
help='选择爬取平台: zgqnb(中国青年报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
# 中国青年报各平台配置
platforms = {
'zgqnb': {
'name': '中国青年报',
'base_url': 'https://www.cyol.com',
'start_url': 'https://www.cyol.com',
'article_selector': 'a'
},
}
if platform == 'all':
target_platforms = platforms.values()
else:
target_platforms = [platforms[platform]]
for platform_config in target_platforms:
website, created = Website.objects.get_or_create(
name=platform_config['name'],
defaults={
'base_url': platform_config['base_url'],
'article_list_url': platform_config['start_url'],
'article_selector': platform_config['article_selector']
}
)
# 确保更新已存在的网站对象的配置
if not created:
website.base_url = platform_config['base_url']
website.article_list_url = platform_config['start_url']
website.article_selector = platform_config['article_selector']
website.save()
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("中国青年报所有平台爬取完成"))
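Each of the per-site commands above shares the same structure; core/tasks.py further down dispatches them collectively through a crawl_all_media management command. A hedged sketch of that call (the exact media names accepted by the media option are an assumption):

# Sketch only: the command name and its comma-separated "media" option are taken
# from the call_command('crawl_all_media', ...) call in core/tasks.py below; the
# accepted media names are assumed here.
from django.core.management import call_command

call_command('crawl_all_media', media='人民日报,新华网,中国青年报')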

View File

@@ -0,0 +1,222 @@
from django.core.management.base import BaseCommand
from core.models import Article, Website
import json
import csv
import os
from django.conf import settings
import zipfile
from django.utils import timezone
from bs4 import BeautifulSoup
# 添加python-docx库支持
import io
from docx import Document
class Command(BaseCommand):
help = '导出文章及相关的媒体文件(图片、视频等)'
def add_arguments(self, parser):
parser.add_argument('--format', type=str, default='docx', help='导出格式: json、csv 或 docx')
parser.add_argument('--website', type=str, help='指定网站名称导出特定网站的文章')
parser.add_argument('--output', type=str, default='', help='输出文件路径')
# 修改默认值为True使包含媒体文件成为默认行为
parser.add_argument('--include-media', action='store_true', default=True, help='包含媒体文件')
# 添加参数控制是否打包成zip
parser.add_argument('--no-zip', action='store_true', help='不打包成zip文件')
def handle(self, *args, **options):
format_type = options['format'].lower()
website_name = options['website']
output_path = options['output']
include_media = options['include_media']
no_zip = options['no_zip']
# 获取文章查询集
articles = Article.objects.all()
if website_name:
try:
website = Website.objects.get(name=website_name)
articles = articles.filter(website=website)
except Website.DoesNotExist:
self.stdout.write(self.style.ERROR(f'网站 "{website_name}" 不存在'))
return
if not articles.exists():
self.stdout.write(self.style.WARNING('没有找到文章'))
return
# 准备导出数据
articles_data = []
media_files = []
for article in articles:
article_data = {
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.isoformat() if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.isoformat(),
'media_files': article.media_files
}
articles_data.append(article_data)
# 收集媒体文件路径
if include_media:
for media_path in article.media_files:
full_path = os.path.join(settings.MEDIA_ROOT, media_path)
if os.path.exists(full_path):
media_files.append(full_path)
# 确定输出路径
if not output_path:
timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
# 默认导出为zip格式
output_path = f'articles_export_{timestamp}.zip'
# 执行导出
# 如果需要包含媒体文件或格式为docx则默认打包成zip
if include_media or format_type == 'docx':
if no_zip:
if format_type == 'docx':
self.export_as_word(articles_data, output_path)
elif format_type == 'json':
self.export_as_json(articles_data, output_path)
elif format_type == 'csv':
self.export_as_csv(articles_data, output_path)
else:
self.export_with_media(articles_data, media_files, output_path, format_type)
else:
if format_type == 'json':
self.export_as_json(articles_data, output_path)
elif format_type == 'csv':
self.export_as_csv(articles_data, output_path)
elif format_type == 'docx':
self.export_as_word(articles_data, output_path)
else:
self.stdout.write(self.style.ERROR('不支持的格式,仅支持 json、csv 或 docx'))
return
self.stdout.write(self.style.SUCCESS(f'成功导出 {len(articles_data)} 篇文章到 {output_path}'))
def export_as_json(self, articles_data, output_path):
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(articles_data, f, ensure_ascii=False, indent=2)
def export_as_csv(self, articles_data, output_path):
if not articles_data:
return
# 打开CSV文件
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for article_data in articles_data:
# 将列表转换为字符串以便在CSV中存储
article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
'media_files'] else ''
writer.writerow(article_data)
# 添加Word格式导出方法
def export_as_word(self, articles_data, output_path):
try:
# 创建一个新的Word文档
document = Document()
document.add_heading('文章导出', 0)
for article_data in articles_data:
# 添加文章标题
document.add_heading(article_data['title'], level=1)
# 添加文章信息
document.add_paragraph(f"网站: {article_data['website']}")
document.add_paragraph(f"URL: {article_data['url']}")
document.add_paragraph(f"发布时间: {article_data['pub_date']}")
document.add_paragraph(f"创建时间: {article_data['created_at']}")
# 添加内容标题
document.add_heading('内容:', level=2)
# 处理HTML内容移除标签
soup = BeautifulSoup(article_data['content'], 'html.parser')
content_text = soup.get_text()
document.add_paragraph(content_text)
# 添加分页符分隔文章
document.add_page_break()
# 保存文档
document.save(output_path)
self.stdout.write(self.style.SUCCESS(f'成功导出为Word格式: {output_path}'))
except Exception as e:
self.stdout.write(self.style.ERROR(f'导出Word格式失败: {e}'))
def export_with_media(self, articles_data, media_files, output_path, format_type):
# 创建ZIP文件
with zipfile.ZipFile(output_path, 'w') as zipf:
# 为每篇文章创建独立的文件夹
for article_data in articles_data:
article_folder = f"article_{article_data['id']}_{article_data['title']}"
# 限制文件夹名称长度并移除非法字符
article_folder = article_folder[:50].rstrip()
article_folder = "".join(c for c in article_folder if c.isalnum() or c in (' ','_','-')).rstrip()
# 添加文章数据文件
if format_type == 'docx':
# 创建Word文档并保存到ZIP
data_filename = f'{article_folder}/article.docx'
try:
# 创建文章信息Word文档
doc = Document()
doc.add_heading(article_data['title'], 0)
# 添加文章信息
doc.add_paragraph(f"网站: {article_data['website']}")
doc.add_paragraph(f"URL: {article_data['url']}")
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
doc.add_paragraph(f"创建时间: {article_data['created_at']}")
# 添加内容标题
doc.add_heading('内容:', level=1)
# 处理HTML内容
soup = BeautifulSoup(article_data['content'], 'html.parser')
content_text = soup.get_text()
doc.add_paragraph(content_text)
# 将文档保存到内存中
doc_buffer = io.BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
# 将文档添加到ZIP文件
zipf.writestr(data_filename, doc_buffer.getvalue())
except Exception as e:
error_msg = f"错误:无法生成文章Word文档 - {str(e)}"
zipf.writestr(data_filename, error_msg)
# 添加媒体文件到文章的media子文件夹
if article_data['media_files']:
for media_file in article_data['media_files']:
try:
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加媒体文件到ZIP中的media子文件夹
media_filename = f"{article_folder}/media/{os.path.basename(media_file)}"
zipf.write(full_path, media_filename)
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
import requests
from io import BytesIO
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
media_filename = f"{article_folder}/media/{os.path.basename(media_file)}"
zipf.writestr(media_filename, image_stream.getvalue())
except Exception as e:
# 错误处理,跳过无法添加的文件
pass
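A usage sketch for this exporter; the registered name export_articles matches the call_command('export_articles') call in core/tasks.py below, while the option values here are examples only:

# Export one website's articles as Word documents, packed into a ZIP together
# with their media files (the default behaviour of this command).
from django.core.management import call_command

call_command('export_articles', format='docx', website='人民网',
             output='renminwang_export.zip')

# Export plain JSON without packing a ZIP archive.
call_command('export_articles', format='json', no_zip=True, output='articles.json')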

View File

@@ -0,0 +1,45 @@
# Generated by Django 5.1 on 2025-09-23 19:28
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0001_initial'),
]
operations = [
migrations.CreateModel(
name='CrawlTask',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('name', models.CharField(max_length=200, verbose_name='任务名称')),
('task_type', models.CharField(choices=[('keyword', '关键词搜索'), ('historical', '历史文章'), ('full_site', '全站爬取')], default='keyword', max_length=20, verbose_name='任务类型')),
('keyword', models.CharField(blank=True, max_length=200, null=True, verbose_name='搜索关键词')),
('websites', models.JSONField(default=list, verbose_name='目标网站')),
('start_date', models.DateField(blank=True, null=True, verbose_name='开始日期')),
('end_date', models.DateField(blank=True, null=True, verbose_name='结束日期')),
('max_pages', models.IntegerField(default=10, verbose_name='最大页数')),
('max_articles', models.IntegerField(default=100, verbose_name='最大文章数')),
('status', models.CharField(choices=[('pending', '等待中'), ('running', '运行中'), ('completed', '已完成'), ('failed', '失败'), ('cancelled', '已取消')], default='pending', max_length=20, verbose_name='状态')),
('progress', models.IntegerField(default=0, verbose_name='进度百分比')),
('current_website', models.CharField(blank=True, max_length=100, null=True, verbose_name='当前网站')),
('current_action', models.CharField(blank=True, max_length=200, null=True, verbose_name='当前操作')),
('total_articles', models.IntegerField(default=0, verbose_name='总文章数')),
('success_count', models.IntegerField(default=0, verbose_name='成功数')),
('failed_count', models.IntegerField(default=0, verbose_name='失败数')),
('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
('started_at', models.DateTimeField(blank=True, null=True, verbose_name='开始时间')),
('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='完成时间')),
('error_message', models.TextField(blank=True, null=True, verbose_name='错误信息')),
('result_details', models.JSONField(blank=True, default=dict, verbose_name='结果详情')),
('created_by', models.CharField(blank=True, max_length=100, null=True, verbose_name='创建者')),
],
options={
'verbose_name': '爬取任务',
'verbose_name_plural': '爬取任务',
'ordering': ['-created_at'],
},
),
]

View File

@@ -0,0 +1,22 @@
# Generated by Django 5.1 on 2025-09-23 19:34
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0002_crawltask'),
]
operations = [
migrations.RemoveField(
model_name='crawltask',
name='websites',
),
migrations.AddField(
model_name='crawltask',
name='websites',
field=models.ManyToManyField(blank=True, to='core.website', verbose_name='目标网站'),
),
]

View File

@@ -0,0 +1,28 @@
# Generated by Django 5.1 on 2025-09-25 02:16
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0003_remove_crawltask_websites_crawltask_websites'),
]
operations = [
migrations.AddField(
model_name='crawltask',
name='execution_count',
field=models.IntegerField(default=0, verbose_name='执行次数'),
),
migrations.AddField(
model_name='crawltask',
name='execution_history',
field=models.JSONField(blank=True, default=list, verbose_name='执行历史'),
),
migrations.AddField(
model_name='crawltask',
name='last_execution_at',
field=models.DateTimeField(blank=True, null=True, verbose_name='最后执行时间'),
),
]

View File

@@ -1,4 +1,7 @@
from django.db import models
from django.utils import timezone
import json
class Website(models.Model):
name = models.CharField(max_length=100, unique=True)
@@ -24,3 +27,147 @@ class Article(models.Model):
def __str__(self):
return self.title
class CrawlTask(models.Model):
"""爬取任务模型"""
TASK_STATUS_CHOICES = [
('pending', '等待中'),
('running', '运行中'),
('completed', '已完成'),
('failed', '失败'),
('cancelled', '已取消'),
]
TASK_TYPE_CHOICES = [
('keyword', '关键词搜索'),
('historical', '历史文章'),
('full_site', '全站爬取'),
]
name = models.CharField(max_length=200, verbose_name="任务名称")
task_type = models.CharField(max_length=20, choices=TASK_TYPE_CHOICES, default='keyword', verbose_name="任务类型")
keyword = models.CharField(max_length=200, blank=True, null=True, verbose_name="搜索关键词")
websites = models.ManyToManyField(Website, blank=True, verbose_name="目标网站")
start_date = models.DateField(blank=True, null=True, verbose_name="开始日期")
end_date = models.DateField(blank=True, null=True, verbose_name="结束日期")
max_pages = models.IntegerField(default=10, verbose_name="最大页数")
max_articles = models.IntegerField(default=100, verbose_name="最大文章数")
status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name="状态")
progress = models.IntegerField(default=0, verbose_name="进度百分比")
current_website = models.CharField(max_length=100, blank=True, null=True, verbose_name="当前网站")
current_action = models.CharField(max_length=200, blank=True, null=True, verbose_name="当前操作")
total_articles = models.IntegerField(default=0, verbose_name="总文章数")
success_count = models.IntegerField(default=0, verbose_name="成功数")
failed_count = models.IntegerField(default=0, verbose_name="失败数")
created_at = models.DateTimeField(auto_now_add=True, verbose_name="创建时间")
started_at = models.DateTimeField(blank=True, null=True, verbose_name="开始时间")
completed_at = models.DateTimeField(blank=True, null=True, verbose_name="完成时间")
error_message = models.TextField(blank=True, null=True, verbose_name="错误信息")
result_details = models.JSONField(default=dict, blank=True, verbose_name="结果详情")
created_by = models.CharField(max_length=100, blank=True, null=True, verbose_name="创建者")
# 执行历史字段
execution_count = models.IntegerField(default=0, verbose_name="执行次数")
last_execution_at = models.DateTimeField(blank=True, null=True, verbose_name="最后执行时间")
execution_history = models.JSONField(default=list, blank=True, verbose_name="执行历史")
class Meta:
verbose_name = "爬取任务"
verbose_name_plural = "爬取任务"
ordering = ['-created_at']
def __str__(self):
return f"{self.name} ({self.get_status_display()})"
def get_websites_display(self):
"""获取网站列表的显示文本"""
try:
websites = self.websites.all()
if not websites:
return "所有网站"
# 确保网站名称是字符串并可以被join处理
website_names = [str(w.name) for w in websites if w.name]
return ", ".join(website_names) if website_names else "所有网站"
except Exception:
# 如果出现任何异常,返回默认值
return "所有网站"
def get_duration(self):
"""获取任务执行时长"""
if not self.started_at:
return None
end_time = self.completed_at or timezone.now()
return end_time - self.started_at
def is_running(self):
"""判断任务是否正在运行"""
return self.status == 'running'
def can_cancel(self):
"""判断任务是否可以取消"""
return self.status in ['pending', 'running']
def get_progress_display(self):
"""获取进度显示文本"""
if self.status == 'pending':
return "等待开始"
elif self.status == 'running':
if self.current_website and self.current_action:
return f"正在处理 {self.current_website}: {self.current_action}"
return f"运行中 ({self.progress}%)"
elif self.status == 'completed':
return f"已完成 ({self.success_count}/{self.total_articles})"
elif self.status == 'failed':
return f"失败: {self.error_message[:50]}..." if self.error_message else "失败"
elif self.status == 'cancelled':
return "已取消"
return "未知状态"
def add_execution_record(self, status, started_at=None, completed_at=None, error_message=None):
"""添加执行记录"""
if not started_at:
started_at = timezone.now()
execution_record = {
'execution_id': len(self.execution_history) + 1,
'started_at': started_at.isoformat() if started_at else None,
'completed_at': completed_at.isoformat() if completed_at else None,
'status': status,
'error_message': error_message,
'success_count': self.success_count,
'failed_count': self.failed_count,
'total_articles': self.total_articles
}
# 更新执行历史
if not self.execution_history:
self.execution_history = []
self.execution_history.append(execution_record)
# 更新执行次数和最后执行时间
self.execution_count += 1
self.last_execution_at = started_at
# 只保留最近10次执行记录
if len(self.execution_history) > 10:
self.execution_history = self.execution_history[-10:]
self.save()
def get_execution_summary(self):
"""获取执行摘要"""
if not self.execution_history:
return "暂无执行记录"
total_executions = len(self.execution_history)
successful_executions = len([r for r in self.execution_history if r['status'] == 'completed'])
failed_executions = len([r for r in self.execution_history if r['status'] == 'failed'])
return f"执行 {total_executions} 次,成功 {successful_executions} 次,失败 {failed_executions} 次"
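A small sketch of how the helper methods above behave for a single completed run:

# Sketch: record one completed execution and read back the summaries.
from django.utils import timezone
from core.models import CrawlTask

task = CrawlTask.objects.create(name='示例任务', task_type='keyword', keyword='人工智能')
task.status = 'completed'
task.success_count = 42
task.total_articles = 42
task.add_execution_record(status='completed',
                          started_at=timezone.now(),
                          completed_at=timezone.now())  # also saves the task
print(task.get_execution_summary())  # 执行 1 次,成功 1 次,失败 0 次
print(task.get_progress_display())   # 已完成 (42/42)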

View File

@@ -0,0 +1,123 @@
/**
* 爬取任务操作JavaScript
*/
function startTask(taskId) {
if (confirm('确定要启动这个任务吗?')) {
fetch(`/admin/core/crawltask/${taskId}/start/`, {
method: 'POST',
headers: {
'X-CSRFToken': getCookie('csrftoken'),
'Content-Type': 'application/x-www-form-urlencoded',
},
})
.then(response => {
if (response.ok) {
location.reload();
} else {
alert('启动任务失败');
}
})
.catch(error => {
console.error('Error:', error);
alert('启动任务失败');
});
}
}
function cancelTask(taskId) {
if (confirm('确定要取消这个任务吗?')) {
fetch(`/admin/core/crawltask/${taskId}/cancel/`, {
method: 'POST',
headers: {
'X-CSRFToken': getCookie('csrftoken'),
'Content-Type': 'application/x-www-form-urlencoded',
},
})
.then(response => {
if (response.ok) {
// 显示取消中的提示
const cancelButton = document.querySelector(`a[href="javascript:void(0)"][onclick="cancelTask(${taskId})"]`);
if (cancelButton) {
cancelButton.textContent = '取消中...';
cancelButton.style.pointerEvents = 'none';
cancelButton.style.opacity = '0.5';
}
// 2秒后刷新页面以查看状态更新
setTimeout(() => location.reload(), 2000);
} else {
alert('取消任务失败');
}
})
.catch(error => {
console.error('Error:', error);
alert('取消任务失败');
});
}
}
function rerunTask(taskId) {
if (confirm('确定要重新执行这个任务吗?这将重置任务状态并重新开始爬取。')) {
fetch(`/admin/core/crawltask/${taskId}/rerun/`, {
method: 'POST',
headers: {
'X-CSRFToken': getCookie('csrftoken'),
'Content-Type': 'application/x-www-form-urlencoded',
},
})
.then(response => {
if (response.ok) {
// 显示重新执行中的提示
const rerunButton = document.querySelector(`a[href="javascript:void(0)"][onclick="rerunTask(${taskId})"]`);
if (rerunButton) {
rerunButton.textContent = '重新执行中...';
rerunButton.style.pointerEvents = 'none';
rerunButton.style.opacity = '0.5';
}
// 2秒后刷新页面以查看状态更新
setTimeout(() => location.reload(), 2000);
} else {
alert('重新执行任务失败');
}
})
.catch(error => {
console.error('Error:', error);
alert('重新执行任务失败');
});
}
}
function viewResults(taskId) {
window.open(`/admin/core/crawltask/${taskId}/results/`, '_blank');
}
function getCookie(name) {
let cookieValue = null;
if (document.cookie && document.cookie !== '') {
const cookies = document.cookie.split(';');
for (let i = 0; i < cookies.length; i++) {
const cookie = cookies[i].trim();
if (cookie.substring(0, name.length + 1) === (name + '=')) {
cookieValue = decodeURIComponent(cookie.substring(name.length + 1));
break;
}
}
}
return cookieValue;
}
// 自动刷新运行中的任务状态
function autoRefreshRunningTasks() {
const runningTasks = document.querySelectorAll('[data-task-status="running"]');
if (runningTasks.length > 0) {
// 每30秒刷新一次页面
setTimeout(() => {
location.reload();
}, 30000);
}
}
// 页面加载完成后执行
document.addEventListener('DOMContentLoaded', function() {
autoRefreshRunningTasks();
});
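The endpoints this script posts to (.../start/, .../cancel/, .../rerun/) are expected to be exposed by the CrawlTask admin class, which is not part of this diff. A hedged sketch of that wiring, using the task_executor instance defined in core/task_executor.py below:

# Hedged sketch only; the real CrawlTaskAdmin is not included in this diff.
from django.contrib import admin
from django.http import JsonResponse
from django.urls import path
from core.models import CrawlTask
from core.task_executor import task_executor

@admin.register(CrawlTask)
class CrawlTaskAdmin(admin.ModelAdmin):
    def get_urls(self):
        custom = [
            path('<int:task_id>/start/', self.admin_site.admin_view(self.start_view)),
            path('<int:task_id>/cancel/', self.admin_site.admin_view(self.cancel_view)),
            path('<int:task_id>/rerun/', self.admin_site.admin_view(self.rerun_view)),
        ]
        return custom + super().get_urls()

    def start_view(self, request, task_id):
        ok, msg = task_executor.start_task(task_id)
        return JsonResponse({'message': msg}, status=200 if ok else 400)

    def cancel_view(self, request, task_id):
        ok, msg = task_executor.cancel_task(task_id)
        return JsonResponse({'message': msg}, status=200 if ok else 400)

    def rerun_view(self, request, task_id):
        ok, msg = task_executor.rerun_task(task_id)
        return JsonResponse({'message': msg}, status=200 if ok else 400)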

474
core/task_executor.py Normal file
View File

@@ -0,0 +1,474 @@
"""
爬取任务执行器
负责执行爬取任务并更新任务状态
"""
import threading
import time
from django.utils import timezone
from django.db import transaction
from core.models import CrawlTask
from core.utils import crawl_by_keyword, crawl_historical_articles, full_site_crawler, WEBSITE_CRAWL_CONFIGS
class TaskExecutor:
"""任务执行器"""
def __init__(self):
self.running_tasks = {}
self.cancelled_tasks = set() # 添加已取消任务的集合
self.lock = threading.Lock()
def start_task(self, task_id, rerun=False):
"""启动任务"""
with self.lock:
if task_id in self.running_tasks:
return False, "任务已在运行中"
try:
task = CrawlTask.objects.get(id=task_id)
# 检查任务状态
if not rerun and task.status != 'pending':
return False, "任务状态不允许启动"
# 如果是重新执行,检查任务是否已完成或失败
if rerun and task.status not in ['completed', 'failed', 'cancelled']:
return False, "只有已完成、失败或已取消的任务可以重新执行"
# 重置任务状态(如果是重新执行)
if rerun:
task.status = 'running'
task.started_at = timezone.now()
task.completed_at = None
task.error_message = None
task.progress = 0
task.current_website = None
task.current_action = None
task.total_articles = 0
task.success_count = 0
task.failed_count = 0
task.result_details = {}
else:
# 更新任务状态
task.status = 'running'
task.started_at = timezone.now()
task.save()
# 确保任务不在取消集合中
self.cancelled_tasks.discard(task_id)
# 启动后台线程执行任务
thread = threading.Thread(target=self._execute_task, args=(task_id,))
thread.daemon = True
thread.start()
self.running_tasks[task_id] = thread
return True, "任务已启动" + ("(重新执行)" if rerun else "")
except CrawlTask.DoesNotExist:
return False, "任务不存在"
except Exception as e:
return False, f"启动任务失败: {e}"
def rerun_task(self, task_id):
"""重新执行任务"""
return self.start_task(task_id, rerun=True)
def cancel_task(self, task_id):
"""取消任务"""
with self.lock:
# 将任务标记为已取消
self.cancelled_tasks.add(task_id)
if task_id in self.running_tasks:
# 标记任务为取消状态
try:
task = CrawlTask.objects.get(id=task_id)
task.status = 'cancelled'
task.completed_at = timezone.now()
task.save()
# 记录执行历史
task.add_execution_record(
status='cancelled',
started_at=task.started_at,
completed_at=task.completed_at,
error_message='任务被取消'
)
# 移除运行中的任务
del self.running_tasks[task_id]
return True, "任务已取消"
except CrawlTask.DoesNotExist:
return False, "任务不存在"
else:
# 即使任务不在运行中,也标记为已取消
try:
task = CrawlTask.objects.get(id=task_id)
if task.status in ['pending', 'running']:
task.status = 'cancelled'
task.completed_at = timezone.now()
task.save()
# 记录执行历史
task.add_execution_record(
status='cancelled',
started_at=task.started_at,
completed_at=task.completed_at,
error_message='任务被取消'
)
return True, "任务已取消"
except CrawlTask.DoesNotExist:
pass
return False, "任务未在运行中"
def is_task_cancelled(self, task_id):
"""检查任务是否已被取消"""
with self.lock:
return task_id in self.cancelled_tasks
def _execute_task(self, task_id):
"""执行任务的核心逻辑"""
try:
task = CrawlTask.objects.get(id=task_id)
# 检查任务是否已被取消
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
# 根据任务类型执行不同的爬取逻辑
if task.task_type == 'keyword':
self._execute_keyword_task(task)
elif task.task_type == 'historical':
self._execute_historical_task(task)
elif task.task_type == 'full_site':
self._execute_full_site_task(task)
else:
raise ValueError(f"不支持的任务类型: {task.task_type}")
# 检查任务是否已被取消
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
# 任务完成
with transaction.atomic():
task = CrawlTask.objects.select_for_update().get(id=task_id)
task.status = 'completed'
task.completed_at = timezone.now()
task.progress = 100
task.save()
# 记录执行历史
task.add_execution_record(
status='completed',
started_at=task.started_at,
completed_at=task.completed_at
)
except Exception as e:
# 检查任务是否已被取消
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
# 任务失败
try:
with transaction.atomic():
task = CrawlTask.objects.select_for_update().get(id=task_id)
task.status = 'failed'
task.completed_at = timezone.now()
task.error_message = str(e)
task.save()
# 记录执行历史
task.add_execution_record(
status='failed',
started_at=task.started_at,
completed_at=task.completed_at,
error_message=str(e)
)
except:
pass
finally:
# 清理运行中的任务记录
with self.lock:
if task_id in self.running_tasks:
del self.running_tasks[task_id]
# 从取消集合中移除任务
self.cancelled_tasks.discard(task_id)
def _mark_task_cancelled(self, task_id):
"""标记任务为已取消"""
try:
with transaction.atomic():
task = CrawlTask.objects.select_for_update().get(id=task_id)
task.status = 'cancelled'
task.completed_at = timezone.now()
task.save()
# 记录执行历史
task.add_execution_record(
status='cancelled',
started_at=task.started_at,
completed_at=task.completed_at,
error_message='任务被取消'
)
except CrawlTask.DoesNotExist:
pass
def _execute_keyword_task(self, task):
"""执行关键词搜索任务"""
task_id = task.id
# 检查任务是否已被取消
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
# 更新当前操作
task.current_action = "开始关键词搜索"
task.save()
# 准备参数
selected_websites = task.websites.all()
if selected_websites:
websites = [w.name for w in selected_websites]
else:
websites = list(WEBSITE_CRAWL_CONFIGS.keys())
start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None
end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None
# 设置任务ID以便在爬虫函数中检查取消状态
crawl_by_keyword.task_id = task_id
# 使用新的关键词爬虫引擎
try:
# 延迟导入以避免循环依赖
from core.keyword_crawler import KeywordCrawler
crawler = KeywordCrawler(task_id, self)
crawler.run()
# 检查任务是否已被取消
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
# 更新结果统计
task = CrawlTask.objects.get(id=task_id)
if task.status == 'completed':
# 统计爬取的文章数量
from core.models import Article
article_count = Article.objects.filter(website__in=task.websites.all()).count()
task.total_articles = article_count
task.success_count = article_count
task.failed_count = 0
task.result_details = {
'total_articles': article_count,
'success_count': article_count,
'failed_count': 0,
'keyword': task.keyword,
'websites': [w.name for w in task.websites.all()]
}
task.save()
# 添加执行记录
task.add_execution_record(
status='completed',
started_at=task.started_at,
completed_at=task.completed_at
)
elif self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
except Exception as e:
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
# 更新任务状态为失败
task = CrawlTask.objects.get(id=task_id)
task.status = 'failed'
task.error_message = str(e)
task.completed_at = timezone.now()
task.save()
# 添加执行记录
task.add_execution_record(
status='failed',
started_at=task.started_at,
completed_at=task.completed_at,
error_message=str(e)
)
raise e
def _execute_historical_task(self, task):
"""执行历史文章任务"""
task_id = task.id
# 检查任务是否已被取消
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
# 更新当前操作
task.current_action = "开始历史文章爬取"
task.save()
# 准备参数
selected_websites = task.websites.all()
if selected_websites:
websites = [w.name for w in selected_websites]
else:
websites = list(WEBSITE_CRAWL_CONFIGS.keys())
start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None
end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None
# 设置任务ID以便在爬虫函数中检查取消状态
crawl_historical_articles.task_id = task_id
# 执行爬取
try:
results = crawl_historical_articles(
website_names=websites,
start_date=start_date,
end_date=end_date,
max_articles_per_site=task.max_articles
)
except Exception as e:
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
raise e
# 检查任务是否已被取消
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
# 更新结果
task.total_articles = results['total_articles']
task.success_count = results['success_count']
task.failed_count = results['failed_count']
task.result_details = results['website_results']
task.save()
def _execute_full_site_task(self, task):
"""执行全站爬取任务"""
task_id = task.id
# 检查任务是否已被取消
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
# 更新当前操作
task.current_action = "开始全站爬取"
task.save()
# 准备参数
selected_websites = task.websites.all()
if selected_websites:
websites = [w.name for w in selected_websites]
else:
websites = list(WEBSITE_CRAWL_CONFIGS.keys())
total_websites = len(websites)
completed_websites = 0
for website_name in websites:
# 检查任务是否已被取消
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
try:
# 更新当前网站
task.current_website = website_name
task.current_action = f"正在爬取 {website_name}"
task.save()
# 获取或创建网站对象
from core.models import Website
website, created = Website.objects.get_or_create(
name=website_name,
defaults={
'base_url': WEBSITE_CRAWL_CONFIGS[website_name]["base_url"],
'enabled': True
}
)
# 设置任务ID以便在爬虫函数中检查取消状态
full_site_crawler.task_id = task_id
# 执行全站爬取
try:
full_site_crawler(
WEBSITE_CRAWL_CONFIGS[website_name]["base_url"],
website,
max_pages=task.max_pages
)
except Exception as e:
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
raise e
completed_websites += 1
progress = int((completed_websites / total_websites) * 100)
task.progress = progress
task.save()
except Exception as e:
# 检查任务是否已被取消
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
# 记录错误但继续处理其他网站
print(f"爬取网站 {website_name} 时出错: {e}")
continue
# 检查任务是否已被取消
if self.is_task_cancelled(task_id):
self._mark_task_cancelled(task_id)
return
# 更新最终结果
task.total_articles = completed_websites # 这里可以改为实际爬取的文章数
task.success_count = completed_websites
task.failed_count = total_websites - completed_websites
task.save()
def get_task_status(self, task_id):
"""获取任务状态"""
try:
task = CrawlTask.objects.get(id=task_id)
return {
'status': task.status,
'progress': task.progress,
'current_website': task.current_website,
'current_action': task.current_action,
'total_articles': task.total_articles,
'success_count': task.success_count,
'failed_count': task.failed_count,
'error_message': task.error_message
}
except CrawlTask.DoesNotExist:
return None
# 全局任务执行器实例
task_executor = TaskExecutor()
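A usage sketch for the module-level executor; callers should go through this shared instance so the running-task registry and the cancellation set stay consistent:

# Usage sketch for the shared executor instance.
from core.models import CrawlTask
from core.task_executor import task_executor

task = CrawlTask.objects.create(name='全站爬取示例', task_type='full_site', max_pages=50)

ok, message = task_executor.start_task(task.id)   # spawns a daemon thread
print(ok, message)                                 # True 任务已启动

print(task_executor.get_task_status(task.id))      # dict with status/progress/current_action ...

ok, message = task_executor.cancel_task(task.id)   # worker checks is_task_cancelled() between steps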

227
core/tasks.py Normal file
View File

@@ -0,0 +1,227 @@
import logging
from celery import shared_task
from django.core.management import call_command
# from django.conf import settings
from .models import Website, Article
from .utils import full_site_crawler
logger = logging.getLogger(__name__)
@shared_task(bind=True, max_retries=3)
def crawl_website(self, website_id, node_id=None, batch_id=None):
"""
爬取单个网站的任务
"""
try:
website = Website.objects.get(id=website_id)
logger.info(f"开始爬取网站: {website.name} (节点: {node_id}, 批次: {batch_id})")
logger.info(f"网站URL: {website.base_url}")
# 记录任务开始
if node_id and batch_id:
from .distributed_crawler import distributed_crawler
distributed_crawler.heartbeat(node_id, 1)
logger.info(f"分布式爬虫心跳已发送 - 节点: {node_id}, 状态: 1")
# 调用爬虫函数
logger.info(f"开始调用 full_site_crawler 函数处理网站: {website.name}")
full_site_crawler(website.base_url, website, max_pages=100)
logger.info(f"完成调用 full_site_crawler 函数处理网站: {website.name}")
# 统计结果
article_count = website.article_set.count()
logger.info(f"网站 {website.name} 爬取完成,共 {article_count} 篇文章")
# 记录任务完成
if node_id and batch_id:
distributed_crawler.heartbeat(node_id, 0)
logger.info(f"分布式爬虫心跳已发送 - 节点: {node_id}, 状态: 0")
result = {
'website_id': website_id,
'website_name': website.name,
'article_count': article_count,
'status': 'success',
'node_id': node_id,
'batch_id': batch_id
}
logger.info(f"任务完成,返回结果: {result}")
return result
except Website.DoesNotExist:
error_msg = f"网站不存在: {website_id}"
logger.error(error_msg)
raise
except Exception as exc:
error_msg = f"爬取网站 {website_id} 失败: {exc}"
logger.error(error_msg)
# 重试任务
logger.info("准备重试任务,将在5分钟后重试")
raise self.retry(exc=exc, countdown=60 * 5) # 5分钟后重试
@shared_task(bind=True, max_retries=3)
def crawl_all_websites(self):
"""
爬取所有网站的任务
"""
try:
logger.info("开始批量爬取所有网站")
# 获取所有启用的网站
websites = Website.objects.filter(enabled=True)
total_websites = websites.count()
logger.info(f"找到 {total_websites} 个启用的网站")
results = []
for website in websites:
try:
logger.info(f"启动网站 {website.name} 的爬取任务")
# 调用单个网站爬取任务
result = crawl_website.delay(website.id)
logger.info(f"网站 {website.name} 的爬取任务已启动任务ID: {result.id}")
results.append({
'website_id': website.id,
'website_name': website.name,
'task_id': result.id
})
except Exception as e:
error_msg = f"启动网站 {website.name} 爬取任务失败: {e}"
logger.error(error_msg)
results.append({
'website_id': website.id,
'website_name': website.name,
'error': str(e)
})
logger.info(f"批量爬取任务启动完成,共 {total_websites} 个网站")
return {
'total_websites': total_websites,
'results': results,
'status': 'started'
}
except Exception as exc:
error_msg = f"批量爬取任务失败: {exc}"
logger.error(error_msg)
raise self.retry(exc=exc, countdown=60 * 10) # 10分钟后重试
@shared_task
def crawl_specific_media(media_list):
"""
爬取指定媒体的任务
"""
try:
logger.info(f"开始爬取指定媒体: {media_list}")
# 调用管理命令
logger.info("调用 crawl_all_media 管理命令")
call_command('crawl_all_media', media=','.join(media_list))
logger.info("crawl_all_media 管理命令执行完成")
return {
'media_list': media_list,
'status': 'success'
}
except Exception as e:
error_msg = f"爬取指定媒体失败: {e}"
logger.error(error_msg)
raise
@shared_task
def cleanup_old_articles(days=30):
"""
清理旧文章的任务
"""
try:
from django.utils import timezone
from datetime import timedelta
cutoff_date = timezone.now() - timedelta(days=days)
logger.info(f"查找 {days} 天前的文章,截止日期: {cutoff_date}")
old_articles = Article.objects.filter(created_at__lt=cutoff_date)
count = old_articles.count()
logger.info(f"找到 {count} 篇旧文章")
old_articles.delete()
logger.info(f"已删除 {count} 篇旧文章")
logger.info(f"清理了 {count} 篇旧文章({days}天前)")
return {
'deleted_count': count,
'cutoff_date': cutoff_date.isoformat(),
'status': 'success'
}
except Exception as e:
error_msg = f"清理旧文章失败: {e}"
logger.error(error_msg)
raise
@shared_task
def export_articles():
"""
导出文章的任务
"""
try:
logger.info("开始导出文章")
# 调用导出命令
logger.info("调用 export_articles 管理命令")
call_command('export_articles')
logger.info("export_articles 管理命令执行完成")
return {
'status': 'success',
'message': '文章导出完成'
}
except Exception as e:
error_msg = f"导出文章失败: {e}"
logger.error(error_msg)
raise
@shared_task
def health_check():
"""
健康检查任务
"""
try:
logger.info("开始执行健康检查")
# 检查数据库连接
website_count = Website.objects.count()
article_count = Article.objects.count()
logger.info(f"数据库状态正常 - 网站数量: {website_count}, 文章数量: {article_count}")
# 检查Redis连接
from django.core.cache import cache
logger.info("检查Redis连接")
cache.set('health_check', 'ok', 60)
cache_result = cache.get('health_check')
logger.info(f"Redis连接状态: {'正常' if cache_result == 'ok' else '异常'}")
result = {
'database': 'ok',
'redis': 'ok' if cache_result == 'ok' else 'error',
'website_count': website_count,
'article_count': article_count,
'status': 'healthy'
}
logger.info(f"健康检查完成,结果: {result}")
return result
except Exception as e:
error_msg = f"健康检查失败: {e}"
logger.error(error_msg)
return {
'status': 'unhealthy',
'error': str(e)
}
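These tasks are meant to be queued through Celery rather than called directly. A short sketch of enqueuing them, plus a possible beat schedule entry (the actual schedule configuration is not part of this diff and is only an assumption):

# Requires a running Celery worker and broker (Redis, per the health check above).
from core.tasks import crawl_all_websites, cleanup_old_articles, health_check

crawl_all_websites.delay()              # fan out one crawl_website task per enabled site
cleanup_old_articles.delay(days=30)     # remove articles older than 30 days
print(health_check.delay().id)          # AsyncResult id, useful for log correlation

# Possible periodic schedule (assumption; configure in the Celery settings):
# CELERY_BEAT_SCHEDULE = {
#     'nightly-crawl': {
#         'task': 'core.tasks.crawl_all_websites',
#         'schedule': crontab(hour=2, minute=0),
#     },
# }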

View File

@@ -0,0 +1,21 @@
{% extends "admin/change_list.html" %}
{% load admin_urls %}
{% block object-tools %}
{{ block.super }}
<!--
<div style="margin-top: 10px;">
<form method="post" action="{% url 'admin:run_crawler' %}" style="display: inline-block;">
{% csrf_token %}
<label for="website-select">选择网站:</label>
<select name="website_name" id="website-select" required>
<option value="">-- 请选择网站 --</option>
{% for website in cl.model_admin.get_websites %}
<option value="{{ website.name }}">{{ website.name }}</option>
{% endfor %}
</select>
<input type="submit" value="执行爬虫" class="default" style="margin-left: 10px;"/>
</form>
</div>
-->
{% endblock %}

View File

@@ -0,0 +1,304 @@
{% extends "admin/base_site.html" %}
{% load static %}
{% block title %}爬虫状态 - {{ site_title|default:_('Django site admin') }}{% endblock %}
{% block extrastyle %}
<style>
.status-card {
background: white;
border: 1px solid #ddd;
border-radius: 8px;
padding: 20px;
margin: 20px 0;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.status-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 20px;
padding-bottom: 10px;
border-bottom: 2px solid #f0f0f0;
}
.status-title {
font-size: 24px;
font-weight: bold;
color: #333;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 30px;
}
.stat-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 8px;
text-align: center;
}
.stat-number {
font-size: 32px;
font-weight: bold;
margin-bottom: 5px;
}
.stat-label {
font-size: 14px;
opacity: 0.9;
}
.nodes-section, .batches-section {
margin-top: 30px;
}
.section-title {
font-size: 20px;
font-weight: bold;
margin-bottom: 15px;
color: #333;
}
.node-item, .batch-item {
background: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 6px;
padding: 15px;
margin-bottom: 10px;
}
.node-header, .batch-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 10px;
}
.node-name, .batch-id {
font-weight: bold;
color: #333;
}
.node-status, .batch-status {
padding: 4px 8px;
border-radius: 4px;
font-size: 12px;
font-weight: bold;
}
.status-active {
background: #d4edda;
color: #155724;
}
.status-running {
background: #fff3cd;
color: #856404;
}
.status-completed {
background: #d1ecf1;
color: #0c5460;
}
.status-failed {
background: #f8d7da;
color: #721c24;
}
.node-details, .batch-details {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 10px;
font-size: 14px;
}
.detail-item {
display: flex;
justify-content: space-between;
}
.detail-label {
color: #666;
}
.detail-value {
font-weight: bold;
color: #333;
}
.progress-bar {
width: 100%;
height: 8px;
background: #e9ecef;
border-radius: 4px;
overflow: hidden;
margin-top: 10px;
}
.progress-fill {
height: 100%;
background: linear-gradient(90deg, #28a745, #20c997);
transition: width 0.3s ease;
}
.refresh-btn {
background: #007bff;
color: white;
border: none;
padding: 8px 16px;
border-radius: 4px;
cursor: pointer;
font-size: 14px;
}
.refresh-btn:hover {
background: #0056b3;
}
.no-data {
text-align: center;
color: #666;
padding: 40px;
font-style: italic;
}
</style>
{% endblock %}
{% block content %}
<div class="status-card">
<div class="status-header">
<h1 class="status-title">爬虫状态监控</h1>
<button class="refresh-btn" onclick="location.reload()">刷新</button>
</div>
<!-- 统计卡片 -->
<div class="stats-grid">
<div class="stat-card">
<div class="stat-number">{{ task_stats.total_nodes }}</div>
<div class="stat-label">活跃节点</div>
</div>
<div class="stat-card">
<div class="stat-number">{{ task_stats.active_tasks }}</div>
<div class="stat-label">运行中任务</div>
</div>
<div class="stat-card">
<div class="stat-number">{{ task_stats.total_batches }}</div>
<div class="stat-label">总批次</div>
</div>
<div class="stat-card">
<div class="stat-number">{{ nodes|length }}</div>
<div class="stat-label">在线节点</div>
</div>
</div>
<!-- 节点状态 -->
<div class="nodes-section">
<h2 class="section-title">爬虫节点状态</h2>
{% if nodes %}
{% for node in nodes %}
<div class="node-item">
<div class="node-header">
<span class="node-name">{{ node.node_id }}</span>
<span class="node-status status-active">{{ node.status }}</span>
</div>
<div class="node-details">
<div class="detail-item">
<span class="detail-label">活跃任务:</span>
<span class="detail-value">{{ node.active_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">完成任务:</span>
<span class="detail-value">{{ node.completed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">失败任务:</span>
<span class="detail-value">{{ node.failed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">最后心跳:</span>
<span class="detail-value">
{% if node.last_heartbeat %}
{{ node.last_heartbeat|date:"H:i:s" }}
{% else %}
未知
{% endif %}
</span>
</div>
</div>
</div>
{% endfor %}
{% else %}
<div class="no-data">
暂无活跃的爬虫节点
</div>
{% endif %}
</div>
<!-- 批次状态 -->
<div class="batches-section">
<h2 class="section-title">最近批次</h2>
{% if batches %}
{% for batch in batches %}
<div class="batch-item">
<div class="batch-header">
<span class="batch-id">{{ batch.batch_id }}</span>
<span class="batch-status status-{{ batch.status }}">
{% if batch.status == 'running' %}
运行中
{% elif batch.status == 'completed' %}
已完成
{% elif batch.status == 'failed' %}
失败
{% else %}
{{ batch.status }}
{% endif %}
</span>
</div>
<div class="batch-details">
<div class="detail-item">
<span class="detail-label">总任务:</span>
<span class="detail-value">{{ batch.total_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">已完成:</span>
<span class="detail-value">{{ batch.completed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">失败:</span>
<span class="detail-value">{{ batch.failed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">进度:</span>
<span class="detail-value">{{ batch.progress|floatformat:1 }}%</span>
</div>
</div>
{% if batch.status == 'running' %}
<div class="progress-bar">
<div class="progress-fill" style="width: {{ batch.progress }}%"></div>
</div>
{% endif %}
</div>
{% endfor %}
{% else %}
<div class="no-data">
暂无批次记录
</div>
{% endif %}
</div>
</div>
<script>
// 自动刷新页面
setTimeout(function () {
location.reload();
}, 30000); // 30秒刷新一次
</script>
{% endblock %}
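The template reads task_stats, nodes and batches from its context. A hedged sketch of a view that could supply them; the real view and the distributed_crawler helper referenced in core/tasks.py are not shown in this diff, so the values below are placeholders:

# Hedged sketch: the dict keys mirror exactly what the template above renders.
from django.shortcuts import render

def crawler_status_view(request):
    context = {
        'task_stats': {'total_nodes': 2, 'active_tasks': 1, 'total_batches': 5},
        'nodes': [{
            'node_id': 'node-1', 'status': 'active', 'active_tasks': 1,
            'completed_tasks': 10, 'failed_tasks': 0, 'last_heartbeat': None,
        }],
        'batches': [{
            'batch_id': 'batch-20250925-01', 'status': 'running',
            'total_tasks': 8, 'completed_tasks': 3, 'failed_tasks': 0,
            'progress': 37.5,
        }],
    }
    # Template path assumed; it depends on where this file lives in the app.
    return render(request, 'admin/crawler_status.html', context)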

View File

@@ -0,0 +1,139 @@
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_modify %}
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
&rsaquo; <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
&rsaquo; {{ title }}
</div>
{% endblock %}
{% block content %}
<h1>{{ title }}</h1>
<div class="help" style="background: #fff3cd; border: 1px solid #ffeaa7; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
<strong>注意:</strong>全站爬取会爬取整个网站的所有文章,可能需要很长时间。建议在非高峰时段进行。
</div>
<form method="post" id="full-site-task-form">
{% csrf_token %}
<fieldset class="module aligned">
<h2>基本信息</h2>
<div class="form-row">
<div>
<label for="id_name" class="required">任务名称:</label>
<input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
<p class="help">为这个全站爬取任务起一个容易识别的名称</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>目标网站</h2>
<div class="form-row">
<div>
<label>选择要爬取的网站:</label>
<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
<label style="display: block; margin: 5px 0;">
<input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
<strong>全选/取消全选</strong>
</label>
<hr style="margin: 10px 0;">
{% for website in websites %}
<label style="display: block; margin: 3px 0;">
<input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
{{ website.name }}
</label>
{% endfor %}
</div>
<p class="help">不选择任何网站将爬取所有支持的网站</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>爬取设置</h2>
<div class="form-row">
<div>
<label for="id_max_pages">最大爬取页数:</label>
<input type="number" name="max_pages" id="id_max_pages" value="500" min="1" max="5000" style="width: 100px;">
<p class="help">每个网站最多爬取的页数 (1-5000)</p>
</div>
</div>
</fieldset>
<div class="submit-row">
<input type="submit" value="创建任务" class="default" name="_save">
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
</div>
</form>
<script>
function toggleAllWebsites() {
const selectAll = document.getElementById('select_all');
const checkboxes = document.querySelectorAll('.website-checkbox');
checkboxes.forEach(checkbox => {
checkbox.checked = selectAll.checked;
});
}
</script>
<style>
.form-row {
margin-bottom: 15px;
}
.form-row label {
display: block;
font-weight: bold;
margin-bottom: 5px;
}
.form-row input[type="text"],
.form-row input[type="number"] {
padding: 5px;
border: 1px solid #ddd;
border-radius: 3px;
}
.form-row .help {
color: #666;
font-size: 12px;
margin-top: 3px;
}
.submit-row {
margin-top: 20px;
padding-top: 20px;
border-top: 1px solid #ddd;
}
.submit-row input[type="submit"] {
background: #417690;
color: white;
padding: 10px 20px;
border: none;
border-radius: 3px;
cursor: pointer;
}
.submit-row .cancel-link {
margin-left: 10px;
padding: 10px 20px;
background: #f8f8f8;
color: #333;
text-decoration: none;
border-radius: 3px;
border: 1px solid #ddd;
}
.submit-row .cancel-link:hover {
background: #e8e8e8;
}
</style>
{% endblock %}

View File

@@ -0,0 +1,164 @@
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_modify %}
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
&rsaquo; <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
&rsaquo; {{ title }}
</div>
{% endblock %}
{% block content %}
<h1>{{ title }}</h1>
<form method="post" id="historical-task-form">
{% csrf_token %}
<fieldset class="module aligned">
<h2>基本信息</h2>
<div class="form-row">
<div>
<label for="id_name" class="required">任务名称:</label>
<input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
<p class="help">为这个历史文章爬取任务起一个容易识别的名称</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>目标网站</h2>
<div class="form-row">
<div>
<label>选择要爬取的网站:</label>
<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
<label style="display: block; margin: 5px 0;">
<input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
<strong>全选/取消全选</strong>
</label>
<hr style="margin: 10px 0;">
{% for website in websites %}
<label style="display: block; margin: 3px 0;">
<input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
{{ website.name }}
</label>
{% endfor %}
</div>
<p class="help">不选择任何网站将爬取所有支持的网站</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>时间范围</h2>
<div class="form-row">
<div>
<label for="id_start_date" class="required">开始日期:</label>
<input type="date" name="start_date" id="id_start_date" required>
<p class="help">历史文章的开始日期</p>
</div>
</div>
<div class="form-row">
<div>
<label for="id_end_date" class="required">结束日期:</label>
<input type="date" name="end_date" id="id_end_date" required>
<p class="help">历史文章的结束日期</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>爬取设置</h2>
<div class="form-row">
<div>
<label for="id_max_articles">每个网站最大文章数:</label>
<input type="number" name="max_articles" id="id_max_articles" value="50" min="1" max="500" style="width: 100px;">
<p class="help">每个网站最多爬取的文章数量 (1-500)</p>
</div>
</div>
</fieldset>
<div class="submit-row">
<input type="submit" value="创建任务" class="default" name="_save">
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
</div>
</form>
<script>
function toggleAllWebsites() {
const selectAll = document.getElementById('select_all');
const checkboxes = document.querySelectorAll('.website-checkbox');
checkboxes.forEach(checkbox => {
checkbox.checked = selectAll.checked;
});
}
// 设置默认日期
document.addEventListener('DOMContentLoaded', function() {
const today = new Date();
const oneMonthAgo = new Date(today.getFullYear(), today.getMonth() - 1, today.getDate());
document.getElementById('id_end_date').value = today.toISOString().split('T')[0];
document.getElementById('id_start_date').value = oneMonthAgo.toISOString().split('T')[0];
});
</script>
<style>
.form-row {
margin-bottom: 15px;
}
.form-row label {
display: block;
font-weight: bold;
margin-bottom: 5px;
}
.form-row input[type="text"],
.form-row input[type="number"],
.form-row input[type="date"] {
padding: 5px;
border: 1px solid #ddd;
border-radius: 3px;
}
.form-row .help {
color: #666;
font-size: 12px;
margin-top: 3px;
}
.submit-row {
margin-top: 20px;
padding-top: 20px;
border-top: 1px solid #ddd;
}
.submit-row input[type="submit"] {
background: #417690;
color: white;
padding: 10px 20px;
border: none;
border-radius: 3px;
cursor: pointer;
}
.submit-row .cancel-link {
margin-left: 10px;
padding: 10px 20px;
background: #f8f8f8;
color: #333;
text-decoration: none;
border-radius: 3px;
border: 1px solid #ddd;
}
.submit-row .cancel-link:hover {
background: #e8e8e8;
}
</style>
{% endblock %}

View File

@@ -0,0 +1,180 @@
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_modify %}
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
&rsaquo; <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
&rsaquo; {{ title }}
</div>
{% endblock %}
{% block content %}
<h1>{{ title }}</h1>
<form method="post" id="keyword-task-form">
{% csrf_token %}
<fieldset class="module aligned">
<h2>基本信息</h2>
<div class="form-row">
<div>
<label for="id_name" class="required">任务名称:</label>
<input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
<p class="help">为这个爬取任务起一个容易识别的名称</p>
</div>
</div>
<div class="form-row">
<div>
<label for="id_keyword" class="required">搜索关键词:</label>
<input type="text" name="keyword" id="id_keyword" required maxlength="200" style="width: 300px;">
<p class="help">输入要搜索的关键词,例如:人工智能、两会、政策等</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>目标网站</h2>
<div class="form-row">
<div>
<label>选择要爬取的网站:</label>
<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
<label style="display: block; margin: 5px 0;">
<input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
<strong>全选/取消全选</strong>
</label>
<hr style="margin: 10px 0;">
{% for website in websites %}
<label style="display: block; margin: 3px 0;">
<input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
{{ website.name }}
</label>
{% endfor %}
</div>
<p class="help">不选择任何网站将爬取所有支持的网站</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>时间范围 (可选)</h2>
<div class="form-row">
<div>
<label for="id_start_date">开始日期:</label>
<input type="date" name="start_date" id="id_start_date">
<p class="help">留空则搜索所有时间</p>
</div>
</div>
<div class="form-row">
<div>
<label for="id_end_date">结束日期:</label>
<input type="date" name="end_date" id="id_end_date">
<p class="help">留空则搜索到当前时间</p>
</div>
</div>
</fieldset>
<fieldset class="module aligned">
<h2>爬取设置</h2>
<div class="form-row">
<div>
<label for="id_max_pages">最大搜索页数:</label>
<input type="number" name="max_pages" id="id_max_pages" value="10" min="1" max="100" style="width: 100px;">
<p class="help">每个网站最多搜索的页数 (1-100)</p>
</div>
</div>
<div class="form-row">
<div>
<label for="id_max_articles">最大文章数量:</label>
<input type="number" name="max_articles" id="id_max_articles" value="100" min="1" max="1000" style="width: 100px;">
<p class="help">总共最多爬取的文章数量 (1-1000)</p>
</div>
</div>
</fieldset>
<div class="submit-row">
<input type="submit" value="创建任务" class="default" name="_save">
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
</div>
</form>
<script>
function toggleAllWebsites() {
const selectAll = document.getElementById('select_all');
const checkboxes = document.querySelectorAll('.website-checkbox');
checkboxes.forEach(checkbox => {
checkbox.checked = selectAll.checked;
});
}
// 设置默认日期
document.addEventListener('DOMContentLoaded', function() {
const today = new Date();
const oneMonthAgo = new Date(today.getFullYear(), today.getMonth() - 1, today.getDate());
document.getElementById('id_end_date').value = today.toISOString().split('T')[0];
document.getElementById('id_start_date').value = oneMonthAgo.toISOString().split('T')[0];
});
</script>
<style>
.form-row {
margin-bottom: 15px;
}
.form-row label {
display: block;
font-weight: bold;
margin-bottom: 5px;
}
.form-row input[type="text"],
.form-row input[type="number"],
.form-row input[type="date"] {
padding: 5px;
border: 1px solid #ddd;
border-radius: 3px;
}
.form-row .help {
color: #666;
font-size: 12px;
margin-top: 3px;
}
.submit-row {
margin-top: 20px;
padding-top: 20px;
border-top: 1px solid #ddd;
}
.submit-row input[type="submit"] {
background: #417690;
color: white;
padding: 10px 20px;
border: none;
border-radius: 3px;
cursor: pointer;
}
.submit-row .cancel-link {
margin-left: 10px;
padding: 10px 20px;
background: #f8f8f8;
color: #333;
text-decoration: none;
border-radius: 3px;
border: 1px solid #ddd;
}
.submit-row .cancel-link:hover {
background: #e8e8e8;
}
</style>
{% endblock %}

View File

@@ -0,0 +1,172 @@
{% extends "admin/base_site.html" %}
{% load i18n static %}
{% block extrastyle %}{{ block.super }}<link rel="stylesheet" type="text/css" href="{% static "admin/css/dashboard.css" %}">{% endblock %}
{% block coltype %}colMS{% endblock %}
{% block bodyclass %}{{ block.super }} dashboard{% endblock %}
{% block breadcrumbs %}{% endblock %}
{% block nav-sidebar %}{% endblock %}
{% block content %}
<div id="content-main">
{% if app_list %}
{% for app in app_list %}
<div class="app-{{ app.app_label }} module">
<table>
<caption>
<a href="{{ app.app_url }}" class="section" title="{% blocktranslate with name=app.name %}Models in the {{ name }} application{% endblocktranslate %}">{{ app.name }}</a>
</caption>
{% for model in app.models %}
<tr class="model-{{ model.object_name|lower }}">
{% if model.admin_url %}
<th scope="row"><a href="{{ model.admin_url }}"{% if model.add_url %} class="addlink"{% endif %}>{{ model.name }}</a></th>
{% else %}
<th scope="row">{{ model.name }}</th>
{% endif %}
{% if model.add_url %}
<td><a href="{{ model.add_url }}" class="addlink">{% translate 'Add' %}</a></td>
{% else %}
<td>&nbsp;</td>
{% endif %}
{% if model.admin_url %}
{% if model.view_only %}
<td><a href="{{ model.admin_url }}" class="viewlink">{% translate 'View' %}</a></td>
{% else %}
<td><a href="{{ model.admin_url }}" class="changelink">{% translate 'Change' %}</a></td>
{% endif %}
{% else %}
<td>&nbsp;</td>
{% endif %}
</tr>
{% endfor %}
</table>
</div>
{% endfor %}
{% else %}
<p>{% translate "You don't have permission to view or edit anything." %}</p>
{% endif %}
<!-- 自定义快速操作区域 -->
<div class="module" style="margin-top: 20px;">
<h2>快速创建爬取任务</h2>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin-top: 15px;">
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
<h3 style="margin-top: 0; color: #417690;">关键词搜索</h3>
<p style="color: #666; font-size: 14px;">根据关键词搜索并爬取相关文章</p>
<a href="{% url 'admin:create_keyword_task' %}" class="button" style="background: #417690; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
创建任务
</a>
</div>
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
<h3 style="margin-top: 0; color: #28a745;">历史文章</h3>
<p style="color: #666; font-size: 14px;">爬取指定日期范围的历史文章</p>
<a href="{% url 'admin:create_historical_task' %}" class="button" style="background: #28a745; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
创建任务
</a>
</div>
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
<h3 style="margin-top: 0; color: #dc3545;">全站爬取</h3>
<p style="color: #666; font-size: 14px;">爬取整个网站的所有文章</p>
<a href="{% url 'admin:create_full_site_task' %}" class="button" style="background: #dc3545; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
创建任务
</a>
</div>
</div>
</div>
<!-- 最近任务状态 -->
<div class="module" style="margin-top: 20px;">
<h2>最近任务状态</h2>
<div style="margin-top: 15px;">
{% load core_extras %}
{% get_recent_tasks as recent_tasks %}
{% if recent_tasks %}
<table style="width: 100%;">
<thead>
<tr style="background: #f8f9fa;">
<th style="padding: 8px; text-align: left;">任务名称</th>
<th style="padding: 8px; text-align: left;">类型</th>
<th style="padding: 8px; text-align: left;">状态</th>
<th style="padding: 8px; text-align: left;">进度</th>
<th style="padding: 8px; text-align: left;">创建时间</th>
<th style="padding: 8px; text-align: left;">操作</th>
</tr>
</thead>
<tbody>
{% for task in recent_tasks %}
<tr>
<td style="padding: 8px;">{{ task.name }}</td>
<td style="padding: 8px;">{{ task.get_task_type_display }}</td>
<td style="padding: 8px;">
<span style="color: {% if task.status == 'completed' %}green{% elif task.status == 'failed' %}red{% elif task.status == 'running' %}blue{% else %}gray{% endif %};">
{{ task.get_status_display }}
</span>
</td>
<td style="padding: 8px;">
{% if task.status == 'running' %}
<div style="width: 100px; background-color: #f0f0f0; border-radius: 3px; overflow: hidden;">
<div style="width: {{ task.progress }}%; background-color: #4CAF50; height: 16px; text-align: center; line-height: 16px; color: white; font-size: 12px;">
{{ task.progress }}%
</div>
</div>
{% else %}
-
{% endif %}
</td>
<td style="padding: 8px;">{{ task.created_at|date:"m-d H:i" }}</td>
<td style="padding: 8px;">
<a href="{% url 'admin:core_crawltask_change' task.id %}" style="color: #417690; text-decoration: none;">查看</a>
</td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p style="color: #666; text-align: center; padding: 20px;">暂无任务</p>
{% endif %}
</div>
</div>
</div>
{% endblock %}
{% block sidebar %}
<div id="content-related">
<div class="module" id="recent-actions-module">
<h2>{% translate 'Recent actions' %}</h2>
<h3>{% translate 'My actions' %}</h3>
{% load log %}
{% get_admin_log 10 as admin_log for_user user %}
{% if not admin_log %}
<p>{% translate 'None available' %}</p>
{% else %}
<ul class="actionlist">
{% for entry in admin_log %}
<li class="{% if entry.is_addition %}addlink{% endif %}{% if entry.is_change %}changelink{% endif %}{% if entry.is_deletion %}deletelink{% endif %}">
{% if entry.is_deletion or not entry.get_admin_url %}
{{ entry.object_repr }}
{% else %}
<a href="{{ entry.get_admin_url }}">{{ entry.object_repr }}</a>
{% endif %}
<br>
{% if entry.content_type %}
<span class="mini quiet">{% filter capfirst %}{{ entry.content_type.name }}{% endfilter %}</span>
{% else %}
<span class="mini quiet">{% translate 'Unknown content' %}</span>
{% endif %}
</li>
{% endfor %}
</ul>
{% endif %}
</div>
</div>
{% endblock %}

View File

@@ -0,0 +1,184 @@
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_modify %}
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
&rsaquo; <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
&rsaquo; {{ title }}
</div>
{% endblock %}
{% block content %}
<h1>{{ title }}</h1>
<div class="results-summary" style="background: #f8f9fa; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2>任务概览</h2>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
<div>
<strong>任务名称:</strong><br>
{{ task.name }}
</div>
<div>
<strong>任务类型:</strong><br>
{{ task.get_task_type_display }}
</div>
<div>
<strong>状态:</strong><br>
<span style="color: {% if task.status == 'completed' %}green{% elif task.status == 'failed' %}red{% elif task.status == 'running' %}blue{% else %}gray{% endif %};">
{{ task.get_status_display }}
</span>
</div>
<div>
<strong>创建时间:</strong><br>
{{ task.created_at|date:"Y-m-d H:i:s" }}
</div>
{% if task.started_at %}
<div>
<strong>开始时间:</strong><br>
{{ task.started_at|date:"Y-m-d H:i:s" }}
</div>
{% endif %}
{% if task.completed_at %}
<div>
<strong>完成时间:</strong><br>
{{ task.completed_at|date:"Y-m-d H:i:s" }}
</div>
{% endif %}
{% if task.get_duration %}
<div>
<strong>执行时长:</strong><br>
{{ task.duration_display }}
</div>
{% endif %}
</div>
</div>
<div class="results-stats" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2>统计信息</h2>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 15px;">
<div style="text-align: center; padding: 15px; background: #e3f2fd; border-radius: 5px;">
<div style="font-size: 24px; font-weight: bold; color: #1976d2;">{{ task.total_articles }}</div>
<div>总文章数</div>
</div>
<div style="text-align: center; padding: 15px; background: #e8f5e8; border-radius: 5px;">
<div style="font-size: 24px; font-weight: bold; color: #388e3c;">{{ task.success_count }}</div>
<div>成功数</div>
</div>
<div style="text-align: center; padding: 15px; background: #ffebee; border-radius: 5px;">
<div style="font-size: 24px; font-weight: bold; color: #d32f2f;">{{ task.failed_count }}</div>
<div>失败数</div>
</div>
{% if task.total_articles > 0 %}
<div style="text-align: center; padding: 15px; background: #fff3e0; border-radius: 5px;">
<div style="font-size: 24px; font-weight: bold; color: #f57c00;">
{% widthratio task.success_count task.total_articles 100 %}%
</div>
<div>成功率</div>
</div>
{% endif %}
</div>
</div>
{% if task.keyword %}
<div class="task-config" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2>任务配置</h2>
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
<div>
<strong>搜索关键词:</strong><br>
{{ task.keyword }}
</div>
<div>
<strong>目标网站:</strong><br>
{{ task.get_websites_display }}
</div>
{% if task.start_date %}
<div>
<strong>开始日期:</strong><br>
{{ task.start_date }}
</div>
{% endif %}
{% if task.end_date %}
<div>
<strong>结束日期:</strong><br>
{{ task.end_date }}
</div>
{% endif %}
<div>
<strong>最大页数:</strong><br>
{{ task.max_pages }}
</div>
<div>
<strong>最大文章数:</strong><br>
{{ task.max_articles }}
</div>
</div>
</div>
{% endif %}
{% if task.current_website or task.current_action %}
<div class="current-status" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2>当前状态</h2>
{% if task.current_website %}
<div>
<strong>当前网站:</strong> {{ task.current_website }}
</div>
{% endif %}
{% if task.current_action %}
<div>
<strong>当前操作:</strong> {{ task.current_action }}
</div>
{% endif %}
{% if task.status == 'running' %}
<div style="margin-top: 10px;">
<div style="width: 100%; background-color: #f0f0f0; border-radius: 10px; overflow: hidden;">
<div style="width: {{ task.progress }}%; background-color: #4CAF50; height: 20px; text-align: center; line-height: 20px; color: white;">
{{ task.progress }}%
</div>
</div>
</div>
{% endif %}
</div>
{% endif %}
{% if task.error_message %}
<div class="error-info" style="background: #ffebee; border: 1px solid #f44336; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2 style="color: #d32f2f;">错误信息</h2>
<pre style="white-space: pre-wrap; word-wrap: break-word;">{{ task.error_message }}</pre>
</div>
{% endif %}
{% if task.result_details %}
<div class="detailed-results" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
<h2>详细结果</h2>
{% for website, result in task.result_details.items %}
<div style="margin-bottom: 15px; padding: 10px; background: #f8f9fa; border-radius: 3px;">
<strong>{{ website }}:</strong>
<ul style="margin: 5px 0; padding-left: 20px;">
<li>找到链接: {{ result.found_urls }}</li>
<li>已处理: {{ result.processed }}</li>
<li>成功: {{ result.success }}</li>
<li>失败: {{ result.failed }}</li>
{% if result.error %}
<li style="color: red;">错误: {{ result.error }}</li>
{% endif %}
</ul>
</div>
{% endfor %}
</div>
{% endif %}
<div class="actions" style="text-align: center; margin-top: 30px;">
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button" style="padding: 10px 20px; background: #417690; color: white; text-decoration: none; border-radius: 3px; margin-right: 10px;">
返回任务列表
</a>
{% if task.status == 'completed' %}
<a href="{% url 'admin:core_article_changelist' %}" class="button" style="padding: 10px 20px; background: #28a745; color: white; text-decoration: none; border-radius: 3px;">
查看文章
</a>
{% endif %}
</div>
{% endblock %}

View File

@@ -1,17 +1,98 @@
<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8" />
<title>{{ article.title }}</title>
<meta charset="UTF-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<title>{{ article.title }} - 绿色课堂</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
line-height: 1.6;
color: #333;
margin: 0 auto;
padding: 20px;
background-color: #f0f8ff;
max-width: 800px;
}
.container {
background: white;
padding: 30px;
margin-bottom: 20px;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
border-radius: 8px;
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
margin-top: 0;
}
.meta {
color: #78909c;
font-size: 0.9em;
margin-bottom: 20px;
}
.content {
margin-top: 20px;
}
/* 优化:确保图片和视频不会超出容器显示 */
.content img, .content video {
max-width: 100%;
height: auto;
display: block;
margin: 10px 0;
}
/* 优化:确保iframe也不会超出容器显示 */
.content iframe {
max-width: 100%;
height: auto;
}
.back-link {
display: inline-block;
margin-bottom: 20px;
color: #1976d2;
text-decoration: none;
}
.back-link:hover {
color: #0d47a1;
text-decoration: underline;
}
@media (max-width: 600px) {
body {
padding: 10px;
}
.container {
padding: 15px;
}
}
</style>
</head>
<body>
<div class="container">
<a href="{% url 'article_list' %}" class="back-link">&laquo; 返回文章列表</a>
<h1>{{ article.title }}</h1>
<p>发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}</p>
<hr />
<div>
<div class="meta">
网站: {{ article.website.name }} |
发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
创建时间: {{ article.created_at|date:"Y-m-d H:i" }} |
源网址: <a href="{{ article.url }}" target="_blank">{{ article.url }}</a>
</div>
<div class="content">
{{ article.content|safe }}
</div>
<hr />
<p><a href="{% url 'article_list' %}">返回列表</a></p>
</div>
</body>
</html>
</html>

View File

@@ -1,33 +1,579 @@
<!DOCTYPE html>
<html lang="zh">
<head>
<meta charset="UTF-8" />
<meta charset="UTF-8"/>
<title>绿色课堂文章列表</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
line-height: 1.6;
color: #333;
margin: 0 auto;
padding: 20px;
background-color: #f0f8ff; /* 统一背景色调 */
}
.container {
background: white;
padding: 30px;
margin-bottom: 20px;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* 添加轻微阴影 */
border-radius: 8px; /* 添加圆角 */
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
margin-top: 0;
}
.filters {
margin-bottom: 20px;
padding: 15px;
background-color: #e3f2fd; /* 统一滤镜背景色调 */
border-radius: 5px;
}
.filters a {
display: inline-block;
padding: 5px 10px;
margin: 0 5px 5px 0;
background-color: #bbdefb; /* 统一链接背景色调 */
color: #0d47a1;
text-decoration: none;
border-radius: 3px;
}
.filters a.active {
background-color: #3498db;
color: white;
}
ul {
list-style: none;
padding: 0;
}
li {
padding: 10px 0;
border-bottom: 1px solid #e0e0e0; /* 统一分隔线颜色 */
}
li:last-child {
border-bottom: none;
}
a {
color: #1976d2; /* 统一链接颜色 */
text-decoration: none;
}
a:hover {
color: #0d47a1; /* 统一悬停颜色 */
text-decoration: underline;
}
.meta {
color: #78909c; /* 统一元数据颜色 */
font-size: 0.9em;
}
.pagination {
margin-top: 30px;
text-align: center;
padding: 20px 0;
}
.pagination a {
display: inline-block;
padding: 8px 16px;
background-color: #3498db;
color: white;
text-decoration: none;
border-radius: 4px;
margin: 0 2px; /* 修改:调整页码间距 */
}
.pagination a:hover {
background-color: #2980b9;
}
.pagination span {
margin: 0 10px;
color: #7f8c8d;
}
/* 新增:当前页码样式 */
.pagination .current {
background-color: #2980b9;
cursor: default;
}
/* 新增:省略号样式 */
.pagination .ellipsis {
display: inline-block;
padding: 8px 4px;
color: #7f8c8d;
}
/* 新增:搜索框样式 */
.search-form {
margin-bottom: 20px;
padding: 15px;
background-color: #e3f2fd; /* 统一搜索框背景色调 */
border-radius: 5px;
}
.search-form input[type="text"] {
padding: 8px 12px;
border: 1px solid #bbdefb; /* 统一边框颜色 */
border-radius: 4px;
width: 300px;
margin-right: 10px;
background-color: #fff;
}
.search-form input[type="submit"] {
padding: 8px 16px;
background-color: #3498db;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
}
.search-form input[type="submit"]:hover {
background-color: #2980b9;
}
.search-info {
color: #78909c; /* 统一搜索信息颜色 */
font-size: 0.9em;
margin-bottom: 10px;
}
/* 新增:左侧筛选栏样式 */
.content-wrapper {
display: flex;
gap: 20px;
}
.sidebar {
flex: 0 0 200px;
background-color: #e3f2fd; /* 统一边栏背景色调 */
border-radius: 5px;
padding: 15px;
}
.main-content {
flex: 1;
}
.sidebar .filters {
margin-bottom: 20px;
padding: 0;
background-color: transparent;
}
.sidebar .filters strong {
display: block;
margin-bottom: 10px;
color: #2c3e50;
}
.sidebar .filters a {
display: block;
padding: 8px 10px;
margin: 0 0 5px 0;
background-color: #bbdefb; /* 统一边栏链接背景色调 */
color: #0d47a1;
text-decoration: none;
border-radius: 3px;
}
.sidebar .filters a.active {
background-color: #3498db;
color: white;
}
/* 新增:导出功能样式 */
.export-section {
margin-bottom: 20px;
padding: 15px;
background-color: #e8f5e9; /* 统一导出区域背景色调 */
border-radius: 5px;
text-align: center;
}
.export-btn {
padding: 10px 20px;
background-color: #4caf50; /* 统一按钮背景色调 */
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
margin: 0 5px;
}
.export-btn:hover {
background-color: #388e3c; /* 统一按钮悬停色调 */
}
.export-btn:disabled {
background-color: #9e9e9e; /* 统一禁用按钮色调 */
cursor: not-allowed;
}
.article-checkbox {
margin-right: 10px;
}
</style>
</head>
<body>
<div class="container">
<h1>绿色课堂文章列表</h1>
<ul>
{% for article in page_obj %}
<li>
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
({{ article.created_at|date:"Y-m-d" }})
</li>
{% empty %}
<li>暂无文章</li>
{% endfor %}
</ul>
<div class="pagination">
{% if page_obj.has_previous %}
<a href="?page={{ page_obj.previous_page_number }}">上一页</a>
{% endif %}
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
{% if page_obj.has_next %}
<a href="?page={{ page_obj.next_page_number }}">下一页</a>
{% endif %}
<!-- 新增:搜索表单 -->
<div class="search-form">
<form method="get">
<input type="text" name="q" placeholder="输入关键词搜索文章..." value="{{ search_query }}">
{% if selected_website %}
<input type="hidden" name="website" value="{{ selected_website.id }}">
{% endif %}
<input type="submit" value="搜索">
</form>
</div>
<div class="content-wrapper">
<!-- 左侧筛选栏 -->
<div class="sidebar">
<div class="filters">
<strong>按网站筛选:</strong>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}"
{% if not selected_website %}class="active" {% endif %}>全部</a>
{% for website in websites %}
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}"
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
{% endfor %}
</div>
<!-- 修改:按媒体类型筛选 -->
<div class="filters">
<strong>按媒体类型筛选:</strong>
<a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query }}&{% endif %}media_type=all"
{% if not request.GET.media_type or request.GET.media_type == 'all' %}class="active"{% endif %}>全部</a>
<a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query }}&{% endif %}media_type=text_only"
{% if request.GET.media_type == 'text_only' %}class="active"{% endif %}>纯文本</a>
<a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query }}&{% endif %}media_type=with_images"
{% if request.GET.media_type == 'with_images' %}class="active"{% endif %}>图片</a>
<a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query }}&{% endif %}media_type=with_videos"
{% if request.GET.media_type == 'with_videos' %}class="active"{% endif %}>视频</a>
</div>
</div>
<!-- 主内容区域 -->
<div class="main-content">
<!-- 新增:搜索结果信息 -->
{% if search_query %}
<div class="search-info">
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
</div>
{% endif %}
<!-- 新增:导出功能 -->
<div class="export-section">
<button id="selectAllBtn" class="export-btn">全选</button>
<button id="deselectAllBtn" class="export-btn">取消全选</button>
<button id="exportJsonBtn" class="export-btn" disabled>导出为JSON</button>
<button id="exportCsvBtn" class="export-btn" disabled>导出为CSV</button>
<!-- 新增:导出为ZIP包按钮 -->
<button id="exportZipBtn" class="export-btn" disabled>导出为ZIP包</button>
<!-- 删除:按类型导出按钮 -->
<!-- <button id="exportTextOnlyBtn" class="export-btn">导出纯文本</button>
<button id="exportWithImagesBtn" class="export-btn">导出含图片</button>
<button id="exportWithVideosBtn" class="export-btn">导出含视频</button> -->
</div>
<ul>
{% for article in page_obj %}
<li>
<input type="checkbox" class="article-checkbox" value="{{ article.id }}"
id="article_{{ article.id }}">
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
</li>
{% empty %}
<li>暂无文章</li>
{% endfor %}
</ul>
<div class="pagination">
{% if page_obj.has_previous %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo;
首页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
{% endif %}
{% endif %}
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
<!-- 修改:优化页码显示逻辑 -->
{% with page_obj.paginator as paginator %}
{% for num in paginator.page_range %}
{% if page_obj.number == num %}
<a href="#" class="current">{{ num }}</a>
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == 1 or num == paginator.num_pages %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
<span class="ellipsis">...</span>
{% endif %}
{% endfor %}
{% endwith %}
{% if page_obj.has_next %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
{% endif %}
{% endif %}
</div>
</div>
</div>
</div>
<script>
// 导出功能相关JavaScript
const checkboxes = document.querySelectorAll('.article-checkbox');
const exportJsonBtn = document.getElementById('exportJsonBtn');
const exportCsvBtn = document.getElementById('exportCsvBtn');
const selectAllBtn = document.getElementById('selectAllBtn');
const deselectAllBtn = document.getElementById('deselectAllBtn');
// 新增:获取ZIP导出按钮元素
const exportZipBtn = document.getElementById('exportZipBtn');
// const exportTextOnlyBtn = document.getElementById('exportTextOnlyBtn');
// const exportWithImagesBtn = document.getElementById('exportWithImagesBtn');
// const exportWithVideosBtn = document.getElementById('exportWithVideosBtn');
// 更新导出按钮状态
function updateExportButtons() {
const selectedCount = document.querySelectorAll('.article-checkbox:checked').length;
exportJsonBtn.disabled = selectedCount === 0;
exportCsvBtn.disabled = selectedCount === 0;
exportZipBtn.disabled = selectedCount === 0; // 新增:更新ZIP导出按钮状态
}
// 为所有复选框添加事件监听器
checkboxes.forEach(checkbox => {
checkbox.addEventListener('change', updateExportButtons);
});
// 全选功能
selectAllBtn.addEventListener('click', () => {
checkboxes.forEach(checkbox => {
checkbox.checked = true;
});
updateExportButtons();
});
// 取消全选功能
deselectAllBtn.addEventListener('click', () => {
checkboxes.forEach(checkbox => {
checkbox.checked = false;
});
updateExportButtons();
});
// 导出为JSON功能
exportJsonBtn.addEventListener('click', () => {
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
.map(checkbox => checkbox.value);
// 发送POST请求导出文章
fetch('{% url "export_articles" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-CSRFToken': '{{ csrf_token }}'
},
body: JSON.stringify({
article_ids: selectedArticles,
format: 'json'
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.json';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 导出为CSV功能
exportCsvBtn.addEventListener('click', () => {
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
.map(checkbox => checkbox.value);
// 发送POST请求导出文章
fetch('{% url "export_articles" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-CSRFToken': '{{ csrf_token }}'
},
body: JSON.stringify({
article_ids: selectedArticles,
format: 'csv'
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.csv';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 新增:导出为ZIP包功能
exportZipBtn.addEventListener('click', () => {
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
.map(checkbox => checkbox.value);
// 发送POST请求导出文章为ZIP包
fetch('{% url "export_articles" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-CSRFToken': '{{ csrf_token }}'
},
body: JSON.stringify({
article_ids: selectedArticles,
format: 'zip' // 指定导出格式为ZIP
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.zip';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// exportTextOnlyBtn.addEventListener('click', () => {
// exportByMediaType('text_only');
// });
// exportWithImagesBtn.addEventListener('click', () => {
// exportByMediaType('with_images');
// });
// exportWithVideosBtn.addEventListener('click', () => {
// exportByMediaType('with_videos');
// });
// function exportByMediaType(mediaType) {
// // 发送POST请求按类型导出文章
// fetch('{% url "export_articles_by_type" %}', {
// method: 'POST',
// headers: {
// 'Content-Type': 'application/json',
// 'X-CSRFToken': '{{ csrf_token }}'
// },
// body: JSON.stringify({
// media_type: mediaType,
// format: 'zip'
// })
// })
// .then(response => {
// if (response.ok) {
// return response.blob();
// }
// throw new Error('导出失败');
// })
// .then(blob => {
// const url = window.URL.createObjectURL(blob);
// const a = document.createElement('a');
// a.href = url;
// a.download = `articles_${mediaType}.zip`;
// document.body.appendChild(a);
// a.click();
// window.URL.revokeObjectURL(url);
// document.body.removeChild(a);
// })
// .catch(error => {
// alert('导出失败: ' + error);
// });
// }
// 初始化导出按钮状态
updateExportButtons();
</script>
</body>
</html>

View File

View File

@@ -0,0 +1,46 @@
from django import template
from django.core.cache import cache
from django.utils.safestring import mark_safe
from core.models import CrawlTask
register = template.Library()
@register.simple_tag
def get_recent_tasks(limit=5):
"""获取最近的任务"""
cache_key = f'recent_tasks_{limit}'
recent_tasks = cache.get(cache_key)
if recent_tasks is None:
recent_tasks = CrawlTask.objects.all()[:limit]
cache.set(cache_key, recent_tasks, 60) # 缓存1分钟
return recent_tasks
@register.filter
def task_status_color(status):
"""根据任务状态返回颜色"""
color_map = {
'pending': 'gray',
'running': 'blue',
'completed': 'green',
'failed': 'red',
'cancelled': 'orange',
}
return color_map.get(status, 'gray')
@register.filter
def task_progress_bar(progress):
"""生成进度条HTML"""
if progress is None:
progress = 0
# mark_safe keeps the generated progress-bar HTML from being autoescaped when the filter is used in a template
return mark_safe(f'''
<div style="width: 100px; background-color: #f0f0f0; border-radius: 3px; overflow: hidden;">
<div style="width: {progress}%; background-color: #4CAF50; height: 16px; text-align: center; line-height: 16px; color: white; font-size: 12px;">
{progress}%
</div>
</div>
''')

View File

@@ -1,3 +1,312 @@
from django.test import TestCase
import os
import tempfile
import shutil
from django.test import TestCase, override_settings
from django.core.management import call_command
from django.core.management.base import CommandError
from django.utils import timezone
from django.core.files.uploadedfile import SimpleUploadedFile
from unittest.mock import patch, MagicMock
from .models import Website, Article
from .utils import process_article, download_media, is_valid_url, full_site_crawler
from .tasks import crawl_website, crawl_all_websites, health_check
# Create your tests here.
class WebsiteModelTest(TestCase):
"""网站模型测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com',
description='测试描述'
)
def test_website_creation(self):
"""测试网站创建"""
self.assertEqual(self.website.name, '测试网站')
self.assertEqual(self.website.base_url, 'https://test.com')
self.assertTrue(self.website.enabled)
def test_website_str(self):
"""测试网站字符串表示"""
self.assertEqual(str(self.website), '测试网站')
class ArticleModelTest(TestCase):
"""文章模型测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
self.article = Article.objects.create(
website=self.website,
title='测试文章',
url='https://test.com/article/1',
content='<p>测试内容</p>',
media_files=['image1.jpg', 'image2.jpg']
)
def test_article_creation(self):
"""测试文章创建"""
self.assertEqual(self.article.title, '测试文章')
self.assertEqual(self.article.url, 'https://test.com/article/1')
self.assertEqual(len(self.article.media_files), 2)
def test_article_str(self):
"""测试文章字符串表示"""
self.assertEqual(str(self.article), '测试文章')
class UtilsTest(TestCase):
"""工具函数测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
self.temp_dir = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.temp_dir)
def test_is_valid_url(self):
"""测试URL验证"""
from .utils import is_valid_url
# 有效URL
self.assertTrue(is_valid_url('https://test.com/article', 'test.com'))
self.assertTrue(is_valid_url('http://test.com/article', 'test.com'))
# 无效URL
self.assertFalse(is_valid_url('https://other.com/article', 'test.com'))
self.assertFalse(is_valid_url('ftp://test.com/article', 'test.com'))
self.assertFalse(is_valid_url('invalid-url', 'test.com'))
@patch('core.utils.requests.get')
def test_download_media(self, mock_get):
"""测试媒体下载"""
# 模拟响应
mock_response = MagicMock()
mock_response.content = b'fake image content'
mock_response.headers = {'content-type': 'image/jpeg'}
mock_get.return_value = mock_response
# 测试下载
result = download_media('https://test.com/image.jpg', self.temp_dir)
self.assertIsNotNone(result)
self.assertTrue(os.path.exists(result))
@patch('core.utils.requests.get')
@patch('core.utils.download_media')
def test_process_article_success(self, mock_download_media, mock_get):
"""测试文章处理成功"""
# 模拟HTML响应
html_content = '''
<html>
<head><title>测试文章</title></head>
<body>
<h1>测试文章标题</h1>
<div class="content">
<p>测试文章内容</p>
<img src="https://test.com/image.jpg">
</div>
</body>
</html>
'''
mock_response = MagicMock()
mock_response.text = html_content
mock_response.encoding = 'utf-8'
mock_response.raise_for_status.return_value = None
mock_get.return_value = mock_response
# 模拟媒体下载
mock_download_media.return_value = '/tmp/test_image.jpg'
# 测试文章处理
process_article('https://test.com/article/1', self.website)
# 验证文章是否保存
article = Article.objects.filter(url='https://test.com/article/1').first()
self.assertIsNotNone(article)
self.assertEqual(article.title, '测试文章标题')
class ManagementCommandsTest(TestCase):
"""管理命令测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
@patch('core.management.commands.crawl_all_media.call_command')
def test_crawl_all_media_command(self, mock_call_command):
"""测试批量爬取命令"""
# 模拟命令执行
mock_call_command.return_value = None
# 执行命令
call_command('crawl_all_media', media='rmrb,xinhua')
# 验证命令被调用
mock_call_command.assert_called()
class CeleryTasksTest(TestCase):
"""Celery任务测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
@patch('core.tasks.full_site_crawler')
def test_crawl_website_task(self, mock_crawler):
"""测试单个网站爬取任务"""
# 模拟爬虫函数
mock_crawler.return_value = None
# 执行任务
result = crawl_website(self.website.id)
# 验证结果
self.assertEqual(result['website_id'], self.website.id)
self.assertEqual(result['website_name'], '测试网站')
self.assertEqual(result['status'], 'success')
def test_crawl_website_task_invalid_id(self):
"""测试无效网站ID的任务"""
# 执行任务
with self.assertRaises(Exception):
crawl_website(99999)
@patch('core.tasks.crawl_website.delay')
def test_crawl_all_websites_task(self, mock_delay):
"""测试批量爬取任务"""
# 模拟子任务
mock_result = MagicMock()
mock_result.id = 'task-123'
mock_delay.return_value = mock_result
# 执行任务
result = crawl_all_websites()
# 验证结果
self.assertEqual(result['total_websites'], 1)
self.assertEqual(result['status'], 'started')
def test_health_check_task(self):
"""测试健康检查任务"""
# 执行任务
result = health_check()
# 验证结果
self.assertEqual(result['database'], 'ok')
self.assertEqual(result['website_count'], 1)
self.assertEqual(result['article_count'], 0)
class IntegrationTest(TestCase):
"""集成测试"""
def setUp(self):
self.website = Website.objects.create(
name='集成测试网站',
base_url='https://integration-test.com'
)
def test_full_workflow(self):
"""测试完整工作流程"""
# 1. 创建网站
self.assertEqual(Website.objects.count(), 1)
# 2. 创建文章
article = Article.objects.create(
website=self.website,
title='集成测试文章',
url='https://integration-test.com/article/1',
content='<p>集成测试内容</p>'
)
# 3. 验证关联关系
self.assertEqual(article.website, self.website)
self.assertEqual(self.website.article_set.count(), 1)
# 4. 验证数据完整性
self.assertIsNotNone(article.created_at)
self.assertIsInstance(article.media_files, list)
@override_settings(MEDIA_ROOT=tempfile.mkdtemp())
class MediaHandlingTest(TestCase):
"""媒体文件处理测试"""
def setUp(self):
self.website = Website.objects.create(
name='媒体测试网站',
base_url='https://media-test.com'
)
def test_media_files_field(self):
"""测试媒体文件字段"""
article = Article.objects.create(
website=self.website,
title='媒体测试文章',
url='https://media-test.com/article/1',
content='<p>测试内容</p>',
media_files=['image1.jpg', 'video1.mp4']
)
# 验证媒体文件列表
self.assertEqual(len(article.media_files), 2)
self.assertIn('image1.jpg', article.media_files)
self.assertIn('video1.mp4', article.media_files)
class ErrorHandlingTest(TestCase):
"""错误处理测试"""
def test_duplicate_url_handling(self):
"""测试重复URL处理"""
website = Website.objects.create(
name='错误测试网站',
base_url='https://error-test.com'
)
# 创建第一篇文章
article1 = Article.objects.create(
website=website,
title='第一篇文章',
url='https://error-test.com/article/1',
content='<p>内容1</p>'
)
# 尝试创建相同URL的文章
with self.assertRaises(Exception):
Article.objects.create(
website=website,
title='第二篇文章',
url='https://error-test.com/article/1', # 相同URL
content='<p>内容2</p>'
)
def test_invalid_website_data(self):
"""测试无效网站数据"""
# 测试重复名称unique约束
Website.objects.create(
name='测试网站1',
base_url='https://test1.com'
)
with self.assertRaises(Exception):
Website.objects.create(
name='测试网站1', # 重复名称
base_url='https://test2.com'
)
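The tests above can also be driven outside of manage.py — a minimal sketch, assuming the settings module is green_classroom.settings and the app label is core (both assumptions based on the surrounding configuration):
import os

import django
from django.conf import settings
from django.test.utils import get_runner

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "green_classroom.settings")
django.setup()
TestRunner = get_runner(settings)
failures = TestRunner(verbosity=2).run_tests(["core.tests"])  # returns the number of failed tests
raise SystemExit(bool(failures))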

View File

@@ -2,9 +2,11 @@ from django.urls import path
from . import views
urlpatterns = [
# 主页,文章列表
path('', views.article_list, name='article_list'),
# 文章详情
path('article/<int:article_id>/', views.article_detail, name='article_detail'),
# 后续可以加更多路径
path('run-crawler/', views.run_crawler, name='run_crawler'),
path('crawler-status/', views.crawler_status, name='crawler_status'),
path('pause-crawler/', views.pause_crawler, name='pause_crawler'),
path('export-articles/', views.export_articles, name='export_articles'),
path('export-articles-by-type/', views.export_articles_by_type, name='export_articles_by_type'),
]
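For manually exercising the export endpoint declared above, a minimal sketch from a client script (assuming the app's URLconf is mounted at the site root of a local development server, and that the listed article IDs exist — both are assumptions for illustration). The export_articles view is csrf_exempt, so no CSRF token is needed:
import requests

# Hypothetical article IDs; "format" may be "json", "csv" or "zip" per the export_articles view
payload = {"article_ids": [1, 2, 3], "format": "zip"}
resp = requests.post("http://127.0.0.1:8000/export-articles/", json=payload, timeout=120)
resp.raise_for_status()
with open("articles_export.zip", "wb") as fh:
    fh.write(resp.content)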

File diff suppressed because it is too large

View File

@@ -1,28 +1,636 @@
from django.shortcuts import render, get_object_or_404
import uuid
from django.shortcuts import render
from django.core.paginator import Paginator
from .models import Article
from django.http import JsonResponse
from django.views.decorators.http import require_http_methods
from django.core.management import call_command
from .models import Article, Website
import threading
from django.http import HttpResponse
import json
import csv
from django.views.decorators.csrf import csrf_exempt
from django.utils import timezone
# 用于跟踪爬虫任务状态的全局字典
crawler_tasks = {}
def article_list(request):
"""
显示文章列表的视图函数
"""
articles = Article.objects.all().order_by('-created_at')
paginator = Paginator(articles, 20) # 每页显示10篇文章
# 获取所有启用的网站
websites = Website.objects.filter(enabled=True)
# 获取筛选网站
selected_website = None
# 修改:确保始终获取所有文章,除非有特定筛选
articles = Article.objects.all()
website_id = request.GET.get('website')
if website_id:
try:
selected_website = Website.objects.get(id=website_id)
articles = articles.filter(website=selected_website)
except Website.DoesNotExist:
pass
# 处理关键词搜索
search_query = request.GET.get('q')
if search_query:
articles = articles.filter(title__icontains=search_query)
# 新增:处理媒体类型筛选
media_type = request.GET.get('media_type', 'all')
if media_type == 'text_only':
# 纯文本文章(没有媒体文件)
articles = articles.filter(media_files__isnull=True) | articles.filter(media_files=[])
elif media_type == 'with_images':
# 包含图片的文章
articles = articles.filter(media_files__icontains='.jpg') | \
articles.filter(media_files__icontains='.jpeg') | \
articles.filter(media_files__icontains='.png') | \
articles.filter(media_files__icontains='.gif')
elif media_type == 'with_videos':
# 包含视频的文章
articles = articles.filter(media_files__icontains='.mp4') | \
articles.filter(media_files__icontains='.avi') | \
articles.filter(media_files__icontains='.mov') | \
articles.filter(media_files__icontains='.wmv') | \
articles.filter(media_files__icontains='.flv') | \
articles.filter(media_files__icontains='.webm')
# 按创建时间倒序排列
articles = articles.order_by('-created_at')
# 分页
paginator = Paginator(articles, 40) # 每页显示40篇文章
page_number = request.GET.get('page')
page_obj = paginator.get_page(page_number)
return render(request, 'core/article_list.html', {
'page_obj': page_obj
'page_obj': page_obj,
'websites': websites,
'selected_website': selected_website,
'search_query': search_query
})
def article_detail(request, article_id):
"""
显示文章详情的视图函数
"""
article = get_object_or_404(Article, id=article_id)
return render(request, 'core/article_detail.html', {
'article': article
})
article = get_object_or_404(Article, id=article_id)  # keep 404 handling so a missing article does not raise DoesNotExist
return render(request, 'core/article_detail.html', {'article': article})
# Create your views here.
# 添加任务ID生成和状态跟踪
@require_http_methods(["POST"])
def run_crawler(request):
"""
从前台触发爬虫任务
"""
try:
# 获取要执行的爬虫名称
crawler_name = request.POST.get('crawler_name', '')
if not crawler_name:
return JsonResponse({'status': 'error', 'message': '爬虫名称不能为空'})
# 生成任务ID
task_id = str(uuid.uuid4())
# 记录任务开始前的文章数量
initial_count = Article.objects.count()
# 在后台线程中运行爬虫任务
def run_spider():
try:
# 更新任务状态为运行中
crawler_tasks[task_id] = {
'status': 'running',
'message': '爬虫正在运行...',
'start_time': timezone.now(),
'initial_count': initial_count
}
# 根据爬虫名称调用相应的命令
if crawler_name in ['crawl_xinhua', 'crawl_dongfangyancao']:
call_command(crawler_name)
else:
# 如果是通用爬虫命令使用crawl_articles
call_command('crawl_articles', crawler_name)
# 计算新增文章数量
final_count = Article.objects.count()
added_count = final_count - initial_count
# 更新任务状态为完成
crawler_tasks[task_id] = {
'status': 'completed',
'message': f'爬虫已完成,新增 {added_count} 篇文章',
'added_count': added_count,
'end_time': timezone.now()
}
except Exception as e:
# 修改:改进错误处理,提供更友好的错误信息
error_msg = str(e)
if "UNIQUE constraint failed" in error_msg and "core_article.url" in error_msg:
error_msg = "检测到重复文章URL已跳过重复项"
else:
print(f"爬虫执行出错: {e}")
# 计算实际新增文章数量(即使有错误也统计)
final_count = Article.objects.count()
added_count = final_count - initial_count
# 更新任务状态为完成(即使有部分错误)
crawler_tasks[task_id] = {
'status': 'completed',
'message': f'爬虫已完成,新增 {added_count} 篇文章。{error_msg}',
'added_count': added_count,
'end_time': timezone.now(),
'error': error_msg
}
# 启动后台线程执行爬虫
thread = threading.Thread(target=run_spider)
thread.daemon = True
thread.start()
return JsonResponse({'status': 'success', 'message': f'爬虫 {crawler_name} 已启动', 'task_id': task_id})
except Exception as e:
return JsonResponse({'status': 'error', 'message': str(e)})
# 检查爬虫状态的视图
@require_http_methods(["POST"])
def crawler_status(request):
"""
检查爬虫任务状态
"""
try:
task_id = request.POST.get('task_id', '')
if not task_id:
return JsonResponse({'status': 'error', 'message': '任务ID不能为空'})
# 获取任务状态
task_info = crawler_tasks.get(task_id)
if not task_info:
return JsonResponse({'status': 'error', 'message': '未找到任务'})
return JsonResponse(task_info)
except Exception as e:
return JsonResponse({'status': 'error', 'message': str(e)})
# 新增:暂停爬虫的视图
@require_http_methods(["POST"])
def pause_crawler(request):
"""
暂停爬虫任务
"""
try:
task_id = request.POST.get('task_id', '')
if not task_id:
return JsonResponse({'status': 'error', 'message': '任务ID不能为空'})
# 获取任务状态
task_info = crawler_tasks.get(task_id)
if not task_info:
return JsonResponse({'status': 'error', 'message': '未找到任务'})
# 在实际应用中,这里应该实现真正的暂停逻辑
# 目前我们只是更新任务状态来模拟暂停功能
task_info['status'] = 'paused'
task_info['message'] = '爬虫已暂停'
return JsonResponse({
'status': 'success',
'message': '爬虫已暂停',
'progress': 0 # 这里应该返回实际进度
})
except Exception as e:
return JsonResponse({'status': 'error', 'message': str(e)})
# 新增:文章导出视图
@csrf_exempt
@require_http_methods(["POST"])
def export_articles(request):
try:
# 解析请求数据
data = json.loads(request.body)
article_ids = data.get('article_ids', [])
format_type = data.get('format', 'json')
# 获取选中的文章
articles = Article.objects.filter(id__in=article_ids)
if not articles.exists():
return HttpResponse('没有选中文章', status=400)
# 根据格式类型导出
if format_type == 'json':
# 准备JSON数据
articles_data = []
for article in articles:
articles_data.append({
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
})
# 创建JSON响应
response = HttpResponse(
json.dumps(articles_data, ensure_ascii=False, indent=2),
content_type='application/json'
)
response['Content-Disposition'] = 'attachment; filename="articles.json"'
return response
elif format_type == 'csv':
# 创建CSV响应
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename="articles.csv"'
# 创建CSV写入器
writer = csv.writer(response)
writer.writerow(['ID', '标题', '网站', 'URL', '发布时间', '内容', '创建时间', '媒体文件'])
# 写入文章数据
for article in articles:
writer.writerow([
article.id,
article.title,
article.website.name,
article.url,
article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else '',
article.content,
article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
';'.join(article.media_files) if article.media_files else ''
])
return response
# 新增:支持ZIP格式导出
elif format_type == 'zip':
import zipfile
from io import BytesIO
from django.conf import settings
import os
# 创建内存中的ZIP文件
zip_buffer = BytesIO()
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
# 为每篇文章创建Word文档并添加到ZIP文件中
for article in articles:
# 为每篇文章创建单独的文件夹
article_folder = f"article_{article.id}_{article.title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')}"
# 创建文章数据
article_data = {
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
}
# 将文章数据保存为Word文件并添加到ZIP
try:
from docx import Document
from docx.shared import Inches
from io import BytesIO
from bs4 import BeautifulSoup
import requests
# 创建Word文档
doc = Document()
doc.add_heading(article.title, 0)
# 添加文章元数据
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"URL: {article.url}")
doc.add_paragraph(
f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
# 添加文章内容
doc.add_heading('内容', level=1)
# 处理HTML内容
soup = BeautifulSoup(article.content, 'html.parser')
# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
try:
# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
except Exception as e:
# 如果添加图片失败添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")
# 移除原始img标签
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
# 添加媒体文件信息
if article.media_files:
doc.add_heading('媒体文件', level=1)
for media_file in article.media_files:
try:
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 检查文件扩展名以确定处理方式
file_extension = os.path.splitext(media_file)[1].lower()
# 图片文件处理
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
doc.add_picture(full_path, width=Inches(4.0))
# 视频文件处理
elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
doc.add_paragraph(f"[视频文件: {media_file}]")
# 其他文件类型
else:
doc.add_paragraph(f"[文件: {media_file}]")
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
file_extension = os.path.splitext(media_file)[1].lower()
# 图片文件处理
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
doc.add_paragraph(f"[文件: {media_file}]")
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
# 保存Word文档到内存
doc_buffer = BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
# 将Word文档添加到ZIP包
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'),
doc_buffer.read())
except ImportError:
# 如果没有安装python-docx库回退到JSON格式
json_data = json.dumps(article_data, ensure_ascii=False, indent=2)
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.json'),
json_data)
# 添加媒体文件到ZIP包
if article.media_files:
for media_file in article.media_files:
try:
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加文件到ZIP包
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
import requests
response = requests.get(media_file, timeout=10)
zip_file.writestr(
os.path.join(article_folder, 'media', os.path.basename(media_file)),
response.content)
except Exception as e:
# 如果添加媒体文件失败,继续处理其他文件
pass
# 创建HttpResponse
zip_buffer.seek(0)
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
return response
else:
return HttpResponse('不支持的格式', status=400)
except Exception as e:
return HttpResponse(f'导出失败: {str(e)}', status=500)
# 新增:按媒体类型导出文章视图
@csrf_exempt
@require_http_methods(["POST"])
def export_articles_by_type(request):
try:
# 解析请求数据
data = json.loads(request.body)
media_type = data.get('media_type', 'all')
format_type = data.get('format', 'zip')
# 根据媒体类型筛选文章
if media_type == 'text_only':
# 纯文本文章(没有媒体文件或媒体文件为空)
articles = Article.objects.filter(media_files__isnull=True) | Article.objects.filter(media_files=[])
elif media_type == 'with_images':
# 包含图片的文章
articles = Article.objects.filter(media_files__icontains='.jpg') | \
Article.objects.filter(media_files__icontains='.jpeg') | \
Article.objects.filter(media_files__icontains='.png') | \
Article.objects.filter(media_files__icontains='.gif')
elif media_type == 'with_videos':
# 包含视频的文章
articles = Article.objects.filter(media_files__icontains='.mp4') | \
Article.objects.filter(media_files__icontains='.avi') | \
Article.objects.filter(media_files__icontains='.mov') | \
Article.objects.filter(media_files__icontains='.wmv') | \
Article.objects.filter(media_files__icontains='.flv') | \
Article.objects.filter(media_files__icontains='.webm')
else:
# 所有文章
articles = Article.objects.all()
# 去重处理
articles = articles.distinct()
if not articles.exists():
return HttpResponse('没有符合条件的文章', status=400)
# 导出为ZIP格式
if format_type == 'zip':
import zipfile
from io import BytesIO
from django.conf import settings
import os
# 创建内存中的ZIP文件
zip_buffer = BytesIO()
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
# 为每篇文章创建Word文档并添加到ZIP文件中
for article in articles:
# 为每篇文章创建单独的文件夹
article_folder = f"article_{article.id}_{article.title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')}"
# 创建文章数据
article_data = {
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
}
# 将文章数据保存为Word文件并添加到ZIP
try:
from docx import Document
from docx.shared import Inches
from io import BytesIO
from bs4 import BeautifulSoup
import requests
# 创建Word文档
doc = Document()
doc.add_heading(article.title, 0)
# 添加文章元数据
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"URL: {article.url}")
doc.add_paragraph(
f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
# 添加文章内容
doc.add_heading('内容', level=1)
# 处理HTML内容
soup = BeautifulSoup(article.content, 'html.parser')
# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
try:
# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
except Exception as e:
# 如果添加图片失败添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")
# 移除原始img标签
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
# 添加媒体文件信息
if article.media_files:
doc.add_heading('媒体文件', level=1)
for media_file in article.media_files:
try:
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 检查文件扩展名以确定处理方式
file_extension = os.path.splitext(media_file)[1].lower()
# 图片文件处理
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
doc.add_picture(full_path, width=Inches(4.0))
# 视频文件处理
elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
doc.add_paragraph(f"[视频文件: {media_file}]")
# 其他文件类型
else:
doc.add_paragraph(f"[文件: {media_file}]")
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
file_extension = os.path.splitext(media_file)[1].lower()
# 图片文件处理
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
doc.add_paragraph(f"[文件: {media_file}]")
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
# 保存Word文档到内存
doc_buffer = BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
# 将Word文档添加到ZIP包
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'),
doc_buffer.read())
except ImportError:
# 如果没有安装python-docx库回退到JSON格式
json_data = json.dumps(article_data, ensure_ascii=False, indent=2)
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.json'),
json_data)
# 添加媒体文件到ZIP包
if article.media_files:
for media_file in article.media_files:
try:
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加文件到ZIP包
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
import requests
response = requests.get(media_file, timeout=10)
zip_file.writestr(
os.path.join(article_folder, 'media', os.path.basename(media_file)),
response.content)
except Exception as e:
# 如果添加媒体文件失败,继续处理其他文件
pass
# 创建HttpResponse
zip_buffer.seek(0)
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = f'attachment; filename=articles_{media_type}.zip'
return response
else:
return HttpResponse('不支持的格式', status=400)
except Exception as e:
return HttpResponse(f'导出失败: {str(e)}', status=500)
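The media-type filtering above unions several querysets with "|"; an equivalent single-query formulation using Q objects is sketched below (same field names as the views above; a possible refactoring sketch, under the assumption that media_files is a JSON list field as used elsewhere in this code):
from django.db.models import Q

from .models import Article

IMAGE_EXTS = ('.jpg', '.jpeg', '.png', '.gif')
VIDEO_EXTS = ('.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm')

def filter_articles_by_media_type(queryset, media_type):
    """Apply the same media-type filters as the views above with a single OR query."""
    if media_type == 'text_only':
        return queryset.filter(Q(media_files__isnull=True) | Q(media_files=[]))
    if media_type == 'with_images':
        exts = IMAGE_EXTS
    elif media_type == 'with_videos':
        exts = VIDEO_EXTS
    else:
        return queryset
    condition = Q()
    for ext in exts:
        condition |= Q(media_files__icontains=ext)
    return queryset.filter(condition).distinct()

# Example: filter_articles_by_media_type(Article.objects.all(), 'with_images')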

710
crawler_engine.py Normal file
View File

@@ -0,0 +1,710 @@
import requests
import time
import re
import logging
import os
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from django.conf import settings
from django.utils import timezone
from django.core.files.base import ContentFile
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
# 禁用SSL警告
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# 设置日志记录器
logger = logging.getLogger(__name__)
class WebsiteCrawler:
"""网站爬虫引擎"""
def __init__(self, task_id):
self.task = CrawlTask.objects.get(id=task_id)
self.keywords = [kw.strip() for kw in self.task.keywords.split(',') if kw.strip()]
# 创建带重试策略的会话
self.session = requests.Session()
self.session.headers.update({
'User-Agent': settings.CRAWLER_SETTINGS['USER_AGENT']
})
# 设置重试策略
retry_strategy = Retry(
total=settings.CRAWLER_SETTINGS.get('MAX_RETRIES', 3),
backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
self.session.mount("http://", adapter)
self.session.mount("https://", adapter)
# 设置超时
self.timeout = settings.CRAWLER_SETTINGS['TIMEOUT']
def log(self, level, message, website=None):
"""记录日志"""
CrawlLog.objects.create(
task=self.task,
website=website,
level=level,
message=message
)
# 同时记录到Python日志系统
logger.log(getattr(logging, level.upper()), f"Task {self.task.id}: {message}")
def update_task_status(self, status, **kwargs):
"""更新任务状态"""
self.task.status = status
if status == 'running' and not self.task.started_at:
self.task.started_at = timezone.now()
elif status in ['completed', 'failed', 'cancelled']:
self.task.completed_at = timezone.now()
for key, value in kwargs.items():
setattr(self.task, key, value)
self.task.save()
def extract_text_content(self, soup):
"""提取文本内容,保持段落结构"""
# 移除脚本和样式标签
for script in soup(["script", "style"]):
script.decompose()
# 处理段落标签,保持段落结构
paragraphs = []
# 查找所有段落相关的标签
for element in soup.find_all(['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br']):
if element.name in ['p', 'div']:
text = element.get_text().strip()
if text:
paragraphs.append(text)
elif element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
text = element.get_text().strip()
if text:
paragraphs.append(f"\n{text}\n") # 标题前后加换行
elif element.name == 'br':
paragraphs.append('\n')
# 如果没有找到段落标签,使用原来的方法
if not paragraphs:
text = soup.get_text()
# 清理文本但保持换行
lines = []
for line in text.splitlines():
line = line.strip()
if line:
lines.append(line)
return '\n\n'.join(lines)
# 合并段落,用双换行分隔
content = '\n\n'.join(paragraphs)
# 清理多余的空行
import re
content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
return content.strip()
def find_article_links(self, soup, base_url):
"""查找文章链接"""
links = []
# 常见的文章链接选择器
selectors = [
'a[href*="article"]',
'a[href*="news"]',
'a[href*="content"]',
'a[href*="detail"]',
'a[href*="view"]',
'a[href*="show"]',
'.news-list a',
'.article-list a',
'.content-list a',
'h3 a',
'h4 a',
'.title a',
'.list-item a'
]
for selector in selectors:
elements = soup.select(selector)
for element in elements:
href = element.get('href')
if href:
full_url = urljoin(base_url, href)
title = element.get_text().strip()
if title and len(title) > 5: # 过滤掉太短的标题
links.append({
'url': full_url,
'title': title
})
return links
def check_keyword_match(self, text, title):
"""检查关键字匹配"""
matched_keywords = []
text_lower = text.lower()
title_lower = title.lower()
for keyword in self.keywords:
keyword_lower = keyword.lower()
if keyword_lower in text_lower or keyword_lower in title_lower:
matched_keywords.append(keyword)
return matched_keywords
def extract_article_content(self, url, soup):
"""提取文章内容"""
# 尝试多种内容选择器
content_selectors = [
'.article-content',
'.content',
'.article-body',
'.news-content',
'.main-content',
'.post-content',
'article',
'.detail-content',
'#content',
'.text'
]
content = ""
for selector in content_selectors:
element = soup.select_one(selector)
if element:
content = self.extract_text_content(element)
if len(content) > 100: # 确保内容足够长
break
# 如果没找到特定内容区域,使用整个页面
if not content or len(content) < 100:
content = self.extract_text_content(soup)
return content
def extract_publish_date(self, soup):
"""提取发布时间"""
date_selectors = [
'.publish-time',
'.pub-time',
'.date',
'.time',
'.publish-date',
'time[datetime]',
'.article-time',
'.news-time',
'.post-time',
'.create-time',
'.update-time',
'.time span',
'.date span',
'.info span', # 一些网站使用.info类包含发布信息
'.meta span',
'.meta-info',
'.article-info span',
'.news-info span',
'.content-info span',
'.a-shijian', # 上海纪检监察网站的发布时间类
'.l-time' # 天津纪检监察网站的发布时间类
]
for selector in date_selectors:
elements = soup.select(selector)
for element in elements:
date_text = element.get_text().strip()
if element.get('datetime'):
date_text = element.get('datetime')
# 如果文本太短或为空,跳过
if not date_text or len(date_text) < 4:
continue
# 尝试解析日期
try:
from datetime import datetime
import re
# 清理日期文本,移除常见的无关字符
date_text = re.sub(r'发布(时间|日期)[:]?', '', date_text).strip()
date_text = re.sub(r'时间[:]?', '', date_text).strip()
date_text = re.sub(r'日期[:]?', '', date_text).strip()
date_text = re.sub(r'发表于[:]?', '', date_text).strip()
date_text = re.sub(r'更新[:]?', '', date_text).strip()
date_text = re.sub(r'\s+', ' ', date_text).strip() # 替换多个空白字符为单个空格
# 如果有 datetime 属性且是标准格式,直接使用
if element.get('datetime'):
datetime_attr = element.get('datetime')
# 尝试解析常见的日期时间格式
for fmt in [
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%dT%H:%M:%S',
'%Y-%m-%dT%H:%M:%S%z',
'%Y-%m-%d %H:%M',
'%Y-%m-%d',
'%Y/%m/%d %H:%M:%S',
'%Y/%m/%d %H:%M',
'%Y/%m/%d',
'%Y年%m月%d日 %H:%M:%S',
'%Y年%m月%d日 %H:%M',
'%Y年%m月%d日',
'%m/%d/%Y %H:%M:%S',
'%m/%d/%Y %H:%M',
'%m/%d/%Y',
'%d/%m/%Y %H:%M:%S',
'%d/%m/%Y %H:%M',
'%d/%m/%Y',
'%d.%m.%Y %H:%M:%S',
'%d.%m.%Y %H:%M',
'%d.%m.%Y'
]:
try:
if '%z' in fmt and '+' not in datetime_attr and datetime_attr.endswith('Z'):
datetime_attr = datetime_attr[:-1] + '+0000'
parsed_date = datetime.strptime(datetime_attr, fmt)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except ValueError:
continue
# 尝试解析从文本中提取的日期
# 尝试解析各种常见的中文日期格式
for fmt in [
'%Y年%m月%d日 %H:%M:%S',
'%Y年%m月%d日 %H:%M',
'%Y年%m月%d日',
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%d %H:%M',
'%Y-%m-%d',
'%Y/%m/%d %H:%M:%S',
'%Y/%m/%d %H:%M',
'%Y/%m/%d',
'%m月%d日 %H:%M',
'%m月%d日',
'%m/%d/%Y %H:%M:%S',
'%m/%d/%Y %H:%M',
'%m/%d/%Y',
'%d/%m/%Y %H:%M:%S',
'%d/%m/%Y %H:%M',
'%d/%m/%Y',
'%d.%m.%Y %H:%M:%S',
'%d.%m.%Y %H:%M',
'%d.%m.%Y'
]:
try:
parsed_date = datetime.strptime(date_text, fmt)
# 如果没有年份,使用当前年份
if '%Y' not in fmt:
parsed_date = parsed_date.replace(year=datetime.now().year)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except ValueError:
continue
# 如果以上格式都不匹配,尝试使用 dateutil 解析
try:
from dateutil import parser
# 过滤掉明显不是日期的文本
if len(date_text) > 5 and not date_text.isdigit():
parsed_date = parser.parse(date_text)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except:
pass
except Exception as e:
self.log('debug', f'解析日期失败: {date_text}, 错误: {str(e)}')
continue
return None
def extract_author(self, soup):
"""提取作者信息"""
author_selectors = [
'.author',
'.writer',
'.publisher',
'.byline',
'.article-author',
'.news-author'
]
for selector in author_selectors:
element = soup.select_one(selector)
if element:
return element.get_text().strip()
return ""
def download_media_file(self, media_url, crawled_content, media_type='image', alt_text=''):
"""下载媒体文件"""
try:
# 检查URL是否有效
if not media_url or not media_url.startswith(('http://', 'https://')):
return None
# 请求媒体文件
response = self.session.get(
media_url,
timeout=self.timeout,
verify=False,
stream=False # 改为False以确保获取完整内容
)
response.raise_for_status()
# 获取文件信息
content_type = response.headers.get('content-type', '')
content_length = response.headers.get('content-length')
file_size = int(content_length) if content_length else len(response.content)
# 确定文件扩展名
file_extension = self.get_file_extension_from_url(media_url, content_type)
# 生成文件名
filename = f"media_{crawled_content.id}_{len(crawled_content.media_files.all())}{file_extension}"
# 创建媒体文件对象
media_file = MediaFile.objects.create(
content=crawled_content,
media_type=media_type,
original_url=media_url,
file_size=file_size,
mime_type=content_type,
alt_text=alt_text
)
# 保存文件
media_file.local_file.save(
filename,
ContentFile(response.content),
save=True
)
self.log('info', f'媒体文件已下载: {filename} ({media_type})', crawled_content.website)
return media_file
except Exception as e:
self.log('error', f'下载媒体文件失败 {media_url}: {str(e)}', crawled_content.website)
return None
def get_file_extension_from_url(self, url, content_type):
"""从URL或内容类型获取文件扩展名"""
# 从URL获取扩展名
parsed_url = urlparse(url)
path = parsed_url.path
if '.' in path:
return os.path.splitext(path)[1]
# 从内容类型获取扩展名
content_type_map = {
'image/jpeg': '.jpg',
'image/jpg': '.jpg',
'image/png': '.png',
'image/gif': '.gif',
'image/webp': '.webp',
'image/svg+xml': '.svg',
'video/mp4': '.mp4',
'video/avi': '.avi',
'video/mov': '.mov',
'video/wmv': '.wmv',
'video/flv': '.flv',
'video/webm': '.webm',
'audio/mp3': '.mp3',
'audio/wav': '.wav',
'audio/ogg': '.ogg',
'application/pdf': '.pdf',
'application/msword': '.doc',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
}
return content_type_map.get(content_type.lower(), '.bin')
def extract_and_download_media(self, soup, crawled_content, base_url):
"""提取并下载页面中的媒体文件"""
media_files = []
# 提取图片
images = soup.find_all('img')
self.log('info', f'找到 {len(images)} 个图片标签', crawled_content.website)
for img in images:
src = img.get('src')
if src:
# 处理相对URL
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = urljoin(base_url, src)
elif not src.startswith(('http://', 'https://')):
src = urljoin(base_url, src)
alt_text = img.get('alt', '')
self.log('info', f'尝试下载图片: {src}', crawled_content.website)
media_file = self.download_media_file(src, crawled_content, 'image', alt_text)
if media_file:
media_files.append(media_file)
self.log('info', f'成功下载图片: {media_file.local_file.name}', crawled_content.website)
# 提取视频
videos = soup.find_all(['video', 'source'])
for video in videos:
src = video.get('src')
if src:
# 处理相对URL
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = urljoin(base_url, src)
elif not src.startswith(('http://', 'https://')):
src = urljoin(base_url, src)
media_file = self.download_media_file(src, crawled_content, 'video')
if media_file:
media_files.append(media_file)
# 提取音频
audios = soup.find_all('audio')
for audio in audios:
src = audio.get('src')
if src:
# 处理相对URL
if src.startswith('//'):
src = 'https:' + src
elif src.startswith('/'):
src = urljoin(base_url, src)
elif not src.startswith(('http://', 'https://')):
src = urljoin(base_url, src)
media_file = self.download_media_file(src, crawled_content, 'audio')
if media_file:
media_files.append(media_file)
return media_files
def mark_content_saved(self, crawled_content):
"""标记内容已保存(内容已存储在数据库中)"""
try:
crawled_content.is_local_saved = True
crawled_content.save()
media_count = crawled_content.media_files.count()
self.log('info', f'文章内容已保存到数据库 (包含 {media_count} 个媒体文件)', crawled_content.website)
return True
except Exception as e:
self.log('error', f'标记内容保存状态失败: {str(e)}', crawled_content.website)
return False
def crawl_website(self, website):
"""爬取单个网站"""
self.log('info', f'开始爬取网站: {website.name}', website)
try:
# 请求主页
response = self.session.get(
website.url,
timeout=self.timeout,
verify=False # 忽略SSL证书验证
)
response.raise_for_status()
# 检查内容编码
if response.encoding != 'utf-8':
# 尝试从响应头获取编码
content_type = response.headers.get('content-type', '')
if 'charset=' in content_type:
charset = content_type.split('charset=')[-1]
response.encoding = charset
else:
response.encoding = 'utf-8'
soup = BeautifulSoup(response.content, 'html.parser')
# 查找文章链接
article_links = self.find_article_links(soup, website.url)
self.log('info', f'找到 {len(article_links)} 个文章链接', website)
crawled_count = 0
for link_info in article_links:
try:
# 请求文章页面
article_response = self.session.get(
link_info['url'],
timeout=self.timeout,
verify=False # 忽略SSL证书验证
)
article_response.raise_for_status()
# 检查内容编码
if article_response.encoding != 'utf-8':
# 尝试从响应头获取编码
content_type = article_response.headers.get('content-type', '')
if 'charset=' in content_type:
charset = content_type.split('charset=')[-1]
article_response.encoding = charset
else:
article_response.encoding = 'utf-8'
article_soup = BeautifulSoup(article_response.content, 'html.parser')
# 提取内容
content = self.extract_article_content(link_info['url'], article_soup)
title = link_info['title']
# 检查关键字匹配
matched_keywords = self.check_keyword_match(content, title)
if matched_keywords:
# 提取其他信息
publish_date = self.extract_publish_date(article_soup)
author = self.extract_author(article_soup)
# 检查是否已存在相同URL的文章
existing_content = CrawledContent.objects.filter(
url=link_info['url'],
task=self.task
).first()
if existing_content:
# 如果已存在,更新现有记录而不是创建新记录
existing_content.title = title
existing_content.content = content
existing_content.publish_date = publish_date
existing_content.author = author
existing_content.keywords_matched = ','.join(matched_keywords)
existing_content.save()
# 更新媒体文件
# 先删除旧的媒体文件
existing_content.media_files.all().delete()
# 然后重新下载媒体文件
media_files = self.extract_and_download_media(article_soup, existing_content, link_info['url'])
self.log('info', f'更新已存在的文章: {title[:50]}...', website)
else:
# 保存新内容
crawled_content = CrawledContent.objects.create(
task=self.task,
website=website,
title=title,
content=content,
url=link_info['url'],
publish_date=publish_date,
author=author,
keywords_matched=','.join(matched_keywords),
is_local_saved=False # 初始设置为False,保存到本地后会更新为True
)
# 提取并下载媒体文件
media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
# 标记内容已保存
self.mark_content_saved(crawled_content)
self.log('info', f'保存新文章: {title[:50]}...', website)
crawled_count += 1
# 请求间隔
time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])
except requests.exceptions.SSLError as e:
self.log('error', f'SSL错误,跳过文章 {link_info["url"]}: {str(e)}', website)
continue
except requests.exceptions.ConnectionError as e:
self.log('error', f'连接错误,跳过文章 {link_info["url"]}: {str(e)}', website)
continue
except requests.exceptions.Timeout as e:
self.log('error', f'请求超时,跳过文章 {link_info["url"]}: {str(e)}', website)
continue
except requests.exceptions.RequestException as e:
self.log('error', f'网络请求错误,跳过文章 {link_info["url"]}: {str(e)}', website)
continue
except UnicodeDecodeError as e:
self.log('error', f'字符编码错误,跳过文章 {link_info["url"]}: {str(e)}', website)
continue
except Exception as e:
self.log('error', f'处理文章失败 {link_info["url"]}: {str(e)}', website)
continue
self.log('info', f'网站爬取完成,共保存 {crawled_count} 篇文章', website)
return crawled_count
except requests.exceptions.SSLError as e:
self.log('error', f'爬取网站SSL错误: {str(e)}', website)
return 0
except requests.exceptions.ConnectionError as e:
self.log('error', f'爬取网站连接错误: {str(e)}', website)
return 0
except requests.exceptions.Timeout as e:
self.log('error', f'爬取网站超时: {str(e)}', website)
return 0
except requests.exceptions.RequestException as e:
self.log('error', f'爬取网站网络错误: {str(e)}', website)
return 0
except Exception as e:
self.log('error', f'爬取网站失败: {str(e)}', website)
return 0
def run(self):
"""运行爬取任务"""
self.log('info', f'开始执行爬取任务: {self.task.name}')
self.update_task_status('running')
total_crawled = 0
websites = self.task.websites.filter(is_active=True)
self.task.total_pages = websites.count()
self.task.save()
for website in websites:
try:
crawled_count = self.crawl_website(website)
total_crawled += crawled_count
self.task.crawled_pages += 1
self.task.save()
except Exception as e:
self.log('error', f'爬取网站 {website.name} 时发生错误: {str(e)}', website)
continue
# 更新任务状态
if total_crawled > 0:
self.update_task_status('completed')
self.log('info', f'爬取任务完成,共爬取 {total_crawled} 篇文章')
else:
self.update_task_status('failed', error_message='没有找到匹配的内容')
self.log('error', '爬取任务失败,没有找到匹配的内容')
def run_crawl_task(task_id):
"""运行爬取任务Celery任务"""
try:
crawler = WebsiteCrawler(task_id)
crawler.run()
return f"任务 {task_id} 执行完成"
except Exception as e:
# 记录异常到日志
logger.error(f"执行任务 {task_id} 时发生异常: {str(e)}", exc_info=True)
task = CrawlTask.objects.get(id=task_id)
task.status = 'failed'
task.error_message = str(e)
task.completed_at = timezone.now()
task.save()
CrawlLog.objects.create(
task=task,
level='error',
message=f'任务执行失败: {str(e)}'
)
return f"任务 {task_id} 执行失败: {str(e)}"

139
docker-compose.yml Normal file

@@ -0,0 +1,139 @@
version: '3.8'
services:
# PostgreSQL数据库
db:
image: postgres:15
environment:
POSTGRES_DB: green_classroom
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
volumes:
- postgres_data:/var/lib/postgresql/data
ports:
- "5432:5432"
healthcheck:
test: ["CMD-SHELL", "pg_isready -U postgres"]
interval: 30s
timeout: 10s
retries: 3
# Redis缓存和消息队列
redis:
image: redis:7-alpine
ports:
- "6379:6379"
volumes:
- redis_data:/data
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 30s
timeout: 10s
retries: 3
# Django Web应用
web:
build: .
command: runserver
environment:
- DEBUG=False
- DB_ENGINE=django.db.backends.postgresql
- DB_NAME=green_classroom
- DB_USER=postgres
- DB_PASSWORD=postgres
- DB_HOST=db
- DB_PORT=5432
- REDIS_URL=redis://redis:6379/0
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- SECRET_KEY=your-production-secret-key-here
- ALLOWED_HOSTS=localhost,127.0.0.1
volumes:
- ./date/media:/app/date/media
- ./logs:/app/logs
ports:
- "8000:8000"
depends_on:
db:
condition: service_healthy
redis:
condition: service_healthy
restart: unless-stopped
# Celery Worker
celery:
build: .
command: celery
environment:
- DEBUG=False
- DB_ENGINE=django.db.backends.postgresql
- DB_NAME=green_classroom
- DB_USER=postgres
- DB_PASSWORD=postgres
- DB_HOST=db
- DB_PORT=5432
- REDIS_URL=redis://redis:6379/0
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- SECRET_KEY=your-production-secret-key-here
volumes:
- ./date/media:/app/date/media
- ./logs:/app/logs
depends_on:
db:
condition: service_healthy
redis:
condition: service_healthy
restart: unless-stopped
# Celery Beat (定时任务)
celery-beat:
build: .
command: celery-beat
environment:
- DEBUG=False
- DB_ENGINE=django.db.backends.postgresql
- DB_NAME=green_classroom
- DB_USER=postgres
- DB_PASSWORD=postgres
- DB_HOST=db
- DB_PORT=5432
- REDIS_URL=redis://redis:6379/0
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- SECRET_KEY=your-production-secret-key-here
volumes:
- ./date/media:/app/date/media
- ./logs:/app/logs
depends_on:
db:
condition: service_healthy
redis:
condition: service_healthy
restart: unless-stopped
# Flower (Celery监控)
flower:
build: .
command: flower
environment:
- DEBUG=False
- DB_ENGINE=django.db.backends.postgresql
- DB_NAME=green_classroom
- DB_USER=postgres
- DB_PASSWORD=postgres
- DB_HOST=db
- DB_PORT=5432
- REDIS_URL=redis://redis:6379/0
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- SECRET_KEY=your-production-secret-key-here
ports:
- "5555:5555"
depends_on:
- redis
restart: unless-stopped
volumes:
postgres_data:
redis_data:
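
The web, celery and celery-beat containers wait for the db and redis healthchecks via depends_on: condition: service_healthy, while flower only waits for redis to start. Note that the bind mounts use ./date/media, whereas the settings change below defaults MEDIA_ROOT to data/media and the compose environment does not override it, so downloaded media may land outside the mounted path unless one of the two is aligned. A quick connectivity check that could be run inside the web container, assuming the compose defaults above (service names db/redis, password postgres); this script is illustrative and not part of the project:

import psycopg2
import redis

def check_backing_services():
    # connection parameters mirror the docker-compose environment above
    conn = psycopg2.connect(host="db", port=5432, dbname="green_classroom",
                            user="postgres", password="postgres", connect_timeout=5)
    conn.close()
    assert redis.Redis.from_url("redis://redis:6379/0").ping()
    print("db and redis are reachable")

if __name__ == "__main__":
    check_backing_services()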

49
green_classroom/celery.py Normal file

@@ -0,0 +1,49 @@
import os
from celery import Celery
from django.conf import settings
# 设置默认Django设置模块
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
app = Celery('green_classroom')
# 使用Django的设置文件
app.config_from_object('django.conf:settings', namespace='CELERY')
# 自动发现任务
app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)
# 配置任务路由
app.conf.task_routes = {
'core.tasks.*': {'queue': 'crawler'},
'core.tasks.crawl_website': {'queue': 'crawler'},
'core.tasks.crawl_all_websites': {'queue': 'crawler'},
}
# 配置任务序列化
app.conf.task_serializer = 'json'
app.conf.result_serializer = 'json'
app.conf.accept_content = ['json']
# 配置时区
app.conf.timezone = settings.TIME_ZONE
# 配置任务执行时间限制
app.conf.task_time_limit = 30 * 60 # 30分钟
app.conf.task_soft_time_limit = 25 * 60 # 25分钟
# 配置重试策略
app.conf.task_acks_late = True
app.conf.task_reject_on_worker_lost = True
# 配置结果后端
app.conf.result_backend = settings.CELERY_RESULT_BACKEND
# 配置工作进程
app.conf.worker_prefetch_multiplier = 1
app.conf.worker_max_tasks_per_child = 1000
@app.task(bind=True)
def debug_task(self):
print(f'Request: {self.request!r}')
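
Assuming a worker and the Redis broker from CELERY_BROKER_URL are running, the wiring can be verified from a Django shell (python manage.py shell); this snippet is illustrative, not part of the project:

from green_classroom.celery import app, debug_task

result = debug_task.delay()           # enqueue on the default queue
print(result.get(timeout=10))         # debug_task prints its request on the worker and returns None
print(app.control.ping(timeout=2))    # list the workers that answered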

green_classroom/settings.py

@@ -10,7 +10,12 @@ For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.1/ref/settings/
"""
import os
from pathlib import Path
from dotenv import load_dotenv
# 加载环境变量
load_dotenv()
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
@@ -19,12 +24,14 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-_kr!&5j#i!)lo(=u-&5ni+21cwxcq)j-35k!ne20)fyx!u6dnl'
SECRET_KEY = os.getenv('SECRET_KEY', 'django-insecure-_kr!&5j#i!)lo(=u-&5ni+21cwxcq)j-35k!ne20)fyx!u6dnl')
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
DEBUG = os.getenv('DEBUG', 'True').lower() == 'true'
ALLOWED_HOSTS = []
ALLOWED_HOSTS = os.getenv('ALLOWED_HOSTS', 'localhost,127.0.0.1,192.168.9.108,green.yuangyaa.com').split(',')
CSRF_TRUSTED_ORIGINS = os.getenv('CSRF_TRUSTED_ORIGINS', 'https://green.yuangyaa.com').split(',')
# Application definition
@@ -36,8 +43,15 @@ INSTALLED_APPS = [
'django.contrib.messages',
'django.contrib.staticfiles',
'core',
'django_celery_beat',
'django_celery_results',
'rest_framework',
'rest_framework.authtoken',
]
# 导入Admin扩展
# import core.admin_extended # 暂时注释,避免循环导入
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
@@ -71,12 +85,30 @@ WSGI_APPLICATION = 'green_classroom.wsgi.application'
# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
# 根据环境变量选择数据库
DB_ENGINE = os.getenv('DB_ENGINE', 'django.db.backends.sqlite3')
if DB_ENGINE == 'django.db.backends.postgresql':
DATABASES = {
'default': {
'ENGINE': DB_ENGINE,
'NAME': os.getenv('DB_NAME', 'green_classroom'),
'USER': os.getenv('DB_USER', 'postgres'),
'PASSWORD': os.getenv('DB_PASSWORD', ''),
'HOST': os.getenv('DB_HOST', 'localhost'),
'PORT': os.getenv('DB_PORT', '5432'),
'OPTIONS': {
'client_encoding': 'UTF8',
},
}
}
else:
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
}
# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
@@ -110,17 +142,122 @@ USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/
STATIC_URL = 'static/'
STATIC_URL = '/static/'
STATIC_ROOT = os.getenv('STATIC_ROOT', os.path.join(BASE_DIR, 'data', 'static'))
# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
import os
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MEDIA_ROOT = os.path.join(BASE_DIR, 'date', 'media')
# 媒体文件配置
MEDIA_ROOT = os.getenv('MEDIA_ROOT', os.path.join(BASE_DIR, 'data', 'media'))
MEDIA_URL = '/media/'
# Celery配置
CELERY_BROKER_URL = os.getenv('CELERY_BROKER_URL', 'redis://127.0.0.1:6379/0')
CELERY_RESULT_BACKEND = os.getenv('CELERY_RESULT_BACKEND', 'redis://127.0.0.1:6379/0')
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = TIME_ZONE
CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = 30 * 60 # 30分钟
# Redis配置
REDIS_URL = os.getenv('REDIS_URL', 'redis://127.0.0.1:6379/0')
# 日志配置
LOGGING = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'verbose': {
'format': '{levelname} {asctime} {module} {process:d} {thread:d} {message}',
'style': '{',
},
'simple': {
'format': '{levelname} {message}',
'style': '{',
},
},
'handlers': {
'file': {
'level': os.getenv('LOG_LEVEL', 'INFO'),
'class': 'logging.FileHandler',
'filename': os.getenv('LOG_FILE', os.path.join(BASE_DIR, 'data', 'logs', 'django.log')),
'formatter': 'verbose',
},
'console': {
'level': os.getenv('LOG_LEVEL', 'INFO'),
'class': 'logging.StreamHandler',
'formatter': 'simple',
},
},
'root': {
'handlers': ['console', 'file'],
'level': os.getenv('LOG_LEVEL', 'INFO'),
},
'loggers': {
'django': {
'handlers': ['console', 'file'],
'level': os.getenv('LOG_LEVEL', 'INFO'),
'propagate': False,
},
'core': {
'handlers': ['console', 'file'],
'level': os.getenv('LOG_LEVEL', 'INFO'),
'propagate': False,
},
},
}
# 安全设置
if not DEBUG:
SECURE_BROWSER_XSS_FILTER = True
SECURE_CONTENT_TYPE_NOSNIFF = True
X_FRAME_OPTIONS = 'DENY'
SECURE_HSTS_SECONDS = 31536000
SECURE_HSTS_INCLUDE_SUBDOMAINS = True
SECURE_HSTS_PRELOAD = True
# 爬虫设置
CRAWLER_TIMEOUT = int(os.getenv('CRAWLER_TIMEOUT', 30))
CRAWLER_MAX_RETRIES = int(os.getenv('CRAWLER_MAX_RETRIES', 3))
CRAWLER_DELAY = int(os.getenv('CRAWLER_DELAY', 1))
# Selenium设置
SELENIUM_HEADLESS = os.getenv('SELENIUM_HEADLESS', 'True').lower() == 'true'
CHROME_DRIVER_PATH = os.getenv('CHROME_DRIVER_PATH', '/usr/bin/chromedriver')
# Sentry监控可选
SENTRY_DSN = os.getenv('SENTRY_DSN')
if SENTRY_DSN:
import sentry_sdk
from sentry_sdk.integrations.django import DjangoIntegration
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[DjangoIntegration()],
traces_sample_rate=1.0,
send_default_pii=True
)
# Django REST Framework 配置
REST_FRAMEWORK = {
'DEFAULT_RENDERER_CLASSES': [
'rest_framework.renderers.JSONRenderer',
'rest_framework.renderers.BrowsableAPIRenderer',
],
'DEFAULT_PERMISSION_CLASSES': [
'rest_framework.permissions.IsAuthenticated',
],
'DEFAULT_AUTHENTICATION_CLASSES': [
'rest_framework.authentication.SessionAuthentication',
'rest_framework.authentication.TokenAuthentication',
],
}
DATA_UPLOAD_MAX_NUMBER_FIELDS = 10240
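
One thing to watch: crawler_engine.py reads settings.CRAWLER_SETTINGS['USER_AGENT'], ['TIMEOUT'] and ['REQUEST_DELAY'] (plus .get('MAX_RETRIES')), while the hunks above only add the flat CRAWLER_TIMEOUT, CRAWLER_MAX_RETRIES and CRAWLER_DELAY values. If CRAWLER_SETTINGS is not defined in a part of settings.py outside these hunks, the engine will fail with an AttributeError. A sketch of the dict it expects, reusing the env-driven values; the USER_AGENT string is an assumption:

CRAWLER_SETTINGS = {
    'USER_AGENT': 'Mozilla/5.0 (compatible; GreenClassroomCrawler/1.0)',  # assumed value
    'TIMEOUT': CRAWLER_TIMEOUT,
    'MAX_RETRIES': CRAWLER_MAX_RETRIES,
    'REQUEST_DELAY': CRAWLER_DELAY,
}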

green_classroom/urls.py

@@ -1,7 +1,10 @@
from django.contrib import admin
from django.urls import path, include
from django.conf import settings
from django.conf.urls.static import static
from django.contrib import admin
from django.urls import path, include
# 需要导入自定义的管理站点实例
urlpatterns = [
path('admin/', admin.site.urls),

83
requirements.txt Normal file

@@ -0,0 +1,83 @@
amqp==5.3.1
asgiref==3.9.1
asttokens==3.0.0
attrs==25.3.0
beautifulsoup4==4.13.4
billiard==4.2.1
bs4==0.0.2
celery==5.5.3
certifi==2025.8.3
charset-normalizer==3.4.3
click==8.2.1
click-didyoumean==0.3.1
click-plugins==1.1.1.2
click-repl==0.3.0
coverage==7.10.3
cron-descriptor==1.4.5
decorator==5.2.1
Django==5.1
django-celery-beat==2.8.1
django-db-connection-pool==1.2.6
django-timezone-field==7.1
django_celery_results==2.6.0
djangorestframework==3.16.1
executing==2.2.0
factory_boy==3.3.3
Faker==37.5.3
greenlet==3.2.4
gunicorn==23.0.0
h11==0.16.0
idna==3.10
iniconfig==2.1.0
ipython==9.4.0
ipython_pygments_lexers==1.1.1
jedi==0.19.2
kombu==5.5.4
lxml==6.0.0
m3u8==6.0.0
matplotlib-inline==0.1.7
outcome==1.3.0.post0
packaging==25.0
parso==0.8.4
pexpect==4.9.0
pluggy==1.6.0
prompt_toolkit==3.0.51
psycopg2-binary==2.9.10
ptyprocess==0.7.0
pure_eval==0.2.3
pycryptodome==3.23.0
Pygments==2.19.2
PySocks==1.7.1
pytest==8.4.1
pytest-cov==6.2.1
pytest-django==4.11.1
python-crontab==3.3.0
python-dateutil==2.9.0.post0
python-docx==1.2.0
python-dotenv==1.1.1
redis==6.4.0
requests==2.32.4
selenium==4.34.2
sentry-sdk==2.35.0
six==1.17.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.7
SQLAlchemy==2.0.43
sqlparams==6.2.0
sqlparse==0.5.3
stack-data==0.6.3
tqdm==4.67.1
traitlets==5.14.3
trio==0.30.0
trio-websocket==0.12.2
typing_extensions==4.14.1
tzdata==2025.2
urllib3==2.5.0
uv==0.8.8
uvicorn==0.35.0
vine==5.1.0
wcwidth==0.2.13
webdriver-manager==4.0.2
websocket-client==1.8.0
wsproto==1.2.0
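
selenium and webdriver-manager are pinned here, and the settings add SELENIUM_HEADLESS and CHROME_DRIVER_PATH, but the generic engine above uses only requests and BeautifulSoup; the Selenium path presumably lives in the per-site management commands (crawl_rmrb, crawl_cctv, and so on) that are not part of this excerpt. A sketch of how such a command might build a headless driver from those settings:

from django.conf import settings
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

def make_driver():
    opts = Options()
    if settings.SELENIUM_HEADLESS:
        opts.add_argument('--headless=new')
    opts.add_argument('--no-sandbox')            # needed when Chrome runs as root in the Docker image
    opts.add_argument('--disable-dev-shm-usage')
    return webdriver.Chrome(service=Service(settings.CHROME_DRIVER_PATH), options=opts)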

122
test_crawlers.py Normal file

@@ -0,0 +1,122 @@
#!/usr/bin/env python
"""
测试爬虫命令的脚本
用于验证所有爬虫命令是否正常工作
"""
import os
import sys
import django
from django.core.management import get_commands
# 设置Django环境
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
django.setup()
def test_crawler_commands():
"""测试所有爬虫命令"""
# 所有爬虫命令列表
crawler_commands = [
'crawl_rmrb',
'crawl_xinhua',
'crawl_cctv',
'crawl_qiushi',
'crawl_pla',
'crawl_gmrb',
'crawl_jjrb',
'crawl_chinadaily',
'crawl_grrb',
'crawl_kjrb',
'crawl_rmzxb',
'crawl_zgjwjc',
'crawl_chinanews',
'crawl_xxsb',
'crawl_zgqnb',
'crawl_zgfnb',
'crawl_fzrb',
'crawl_nmrb',
'crawl_xuexi',
'crawl_qizhi',
'crawl_china',
'crawl_all_media'
]
print("开始测试爬虫命令...")
print("=" * 50)
for command in crawler_commands:
try:
print(f"测试命令: {command}")
# 只测试命令是否存在,不实际执行爬取
# 这里可以添加实际的测试逻辑
print(f"{command} 命令可用")
except Exception as e:
print(f"{command} 命令测试失败: {e}")
print("=" * 50)
print("爬虫命令测试完成")
def test_export_command():
"""测试导出命令"""
try:
print("测试导出命令...")
# 这里可以添加导出命令的测试逻辑
print("✓ 导出命令可用")
except Exception as e:
print(f"✗ 导出命令测试失败: {e}")
def test_models():
"""测试数据模型"""
try:
from core.models import Website, Article
print("测试数据模型...")
# 测试创建网站对象
website, created = Website.objects.get_or_create(
name="测试网站",
defaults={
'base_url': 'https://test.com',
'article_list_url': 'https://test.com',
'article_selector': 'a'
}
)
print(f"✓ 网站模型测试通过: {website.name}")
# 清理测试数据
if created:
website.delete()
except Exception as e:
print(f"✗ 数据模型测试失败: {e}")
def main():
"""主函数"""
print("中央主流媒体爬虫系统测试")
print("=" * 50)
# 测试数据模型
test_models()
print()
# 测试爬虫命令
test_crawler_commands()
print()
# 测试导出命令
test_export_command()
print()
print("所有测试完成!")
print("=" * 50)
print("使用方法:")
print("1. 单个媒体爬取: python manage.py crawl_rmrb")
print("2. 批量爬取: python manage.py crawl_all_media")
print("3. 导出数据: python manage.py export_articles --format json")
print("4. 查看帮助: python manage.py help")
if __name__ == '__main__':
main()