fix bugs and support all platforms

2025-08-15 08:33:47 +08:00
parent e82b85f4dd
commit 4945b4c6b0
36 changed files with 2296 additions and 992 deletions

.gitignore (vendored): 6 lines changed

@@ -180,5 +180,11 @@ cython_debug/
#
#####################################
# 数据目录
data/
data/media/
# 配置文件
config/
.env


@@ -1,517 +0,0 @@
from .models import Website, Article
# 添加actions相关的导入
from django.contrib import messages
# 添加导出功能所需导入
import csv
from django.http import HttpResponse
import json
# 添加视图函数需要的导入
from django.shortcuts import render, redirect
from django.urls import path
from django.contrib import admin
from django.core.management import call_command
# 添加运行爬虫的视图函数
def run_crawler_view(request):
"""
管理后台运行爬虫的视图
"""
if request.method == 'POST':
website_name = request.POST.get('website_name')
if not website_name:
messages.error(request, '请选择要爬取的网站')
return redirect('admin:core_article_changelist')
try:
# 动态获取网站对象
website = Website.objects.get(name=website_name)
# 根据网站对象确定要执行的爬虫命令
# 移除默认的通用爬虫,每个网站必须配置自己的爬虫命令
crawler_name = getattr(website, 'crawler_command', None)
# 如果网站没有配置爬虫命令,则报错
if not crawler_name:
messages.error(request, f'网站 {website_name} 未配置爬虫命令')
return redirect('admin:core_article_changelist')
# 运行爬虫命令,传递网站名称
call_command(crawler_name, website_name)
messages.success(request, f'成功执行爬虫: {website_name}')
except Website.DoesNotExist:
messages.error(request, f'网站不存在: {website_name}')
except Exception as e:
messages.error(request, f'执行爬虫失败: {str(e)}')
return redirect('admin:core_article_changelist')
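# A minimal setup sketch for the view above, assuming the Website model exposes a
# crawler_command field and that a management command named 'crawl_news_cn' exists
# (both names are illustrative, not confirmed by this commit):
from core.models import Website
site, _ = Website.objects.get_or_create(
    name='www.news.cn',
    defaults={'base_url': 'https://www.news.cn', 'enabled': True},
)
site.crawler_command = 'crawl_news_cn'  # hypothetical command under core/management/commands/
site.save()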
@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
list_display = ('name', 'base_url', 'enabled')
# 为ArticleAdmin添加自定义动作
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'website', 'pub_date')
search_fields = ('title', 'content')
# 添加动作选项
actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json',
'export_as_word', 'export_with_media']
def get_websites(self):
"""获取所有启用的网站"""
return Website.objects.filter(enabled=True)
# 重写get_urls方法添加自定义URL
def get_urls(self):
urls = super().get_urls()
custom_urls = [
path('run-crawler/', self.admin_site.admin_view(run_crawler_view), name='run_crawler'),
]
return custom_urls + urls
def export_as_csv(self, request, queryset):
"""导出选中的文章为CSV格式"""
meta = self.model._meta
field_names = [field.name for field in meta.fields]
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta)
writer = csv.writer(response)
writer.writerow(field_names)
for obj in queryset:
row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field) for field in
field_names]
writer.writerow(row)
return response
export_as_csv.short_description = "导出选中文章为CSV格式"
def export_as_json(self, request, queryset):
"""导出选中的文章为JSON格式"""
response = HttpResponse(content_type='application/json')
response['Content-Disposition'] = 'attachment; filename=articles.json'
# 构造要导出的数据
articles_data = []
for article in queryset:
articles_data.append({
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
})
# 写入JSON数据
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
return response
export_as_json.short_description = "导出选中文章为JSON格式"
def export_as_word(self, request, queryset):
"""导出选中的文章为Word格式"""
try:
from docx import Document
from io import BytesIO
from docx.shared import Inches
except ImportError:
self.message_user(request, "缺少python-docx库请安装: pip install python-docx", messages.ERROR)
return
# 创建Word文档
doc = Document()
doc.add_heading('文章导出', 0)
for article in queryset:
# 添加文章标题
doc.add_heading(article.title, level=1)
# 添加文章元数据
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"URL: {article.url}")
doc.add_paragraph(
f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
# 添加文章内容
doc.add_heading('内容', level=2)
# 简单处理HTML内容移除标签并处理图片
from bs4 import BeautifulSoup
soup = BeautifulSoup(article.content, 'html.parser')
# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
# 尝试添加图片到文档
try:
import os
from django.conf import settings
import requests
from io import BytesIO
# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
except Exception as e:
# 如果添加图片失败添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")
# 移除原始img标签
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
# 添加媒体文件信息
if article.media_files:
doc.add_heading('媒体文件', level=2)
for media_file in article.media_files:
try:
import os
from django.conf import settings
from io import BytesIO
import requests
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加图片到文档
doc.add_picture(full_path, width=Inches(4.0))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
# 添加分页符
doc.add_page_break()
# 保存到内存
buffer = BytesIO()
doc.save(buffer)
buffer.seek(0)
# 创建HttpResponse
from django.http import HttpResponse
response = HttpResponse(buffer.getvalue(),
content_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
response['Content-Disposition'] = 'attachment; filename=articles.docx'
return response
export_as_word.short_description = "导出选中文章为Word格式"
def export_with_media(self, request, queryset):
"""导出选中的文章及媒体文件为ZIP包"""
try:
from docx import Document
from io import BytesIO
from docx.shared import Inches
import zipfile
except ImportError:
self.message_user(request, "缺少必要库,请安装: pip install python-docx", messages.ERROR)
return
# 创建内存中的ZIP文件
zip_buffer = BytesIO()
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
for article in queryset:
# 为每篇文章创建单独的文件夹(清理标题中的非法文件名字符)
safe_title = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in article.title)
article_folder = f"article_{article.id}_{safe_title}"
# 创建Word文档
doc = Document()
doc.add_heading(article.title, 0)
# 添加文章元数据
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"URL: {article.url}")
doc.add_paragraph(
f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
# 添加文章内容
doc.add_heading('内容', level=2)
# 简单处理HTML内容移除标签并处理图片
from bs4 import BeautifulSoup
soup = BeautifulSoup(article.content, 'html.parser')
# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
# 尝试添加图片到文档
try:
import os
from django.conf import settings
import requests
# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
# 将网络文件保存到ZIP
zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(src)),
response.content)
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
# 添加文件到ZIP包
zip_file.write(full_path, os.path.join(article_folder, 'media', src.lstrip('/')))
except Exception as e:
# 如果添加图片失败添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")
# 移除原始img标签
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
# 添加媒体文件信息并打包媒体文件
if article.media_files:
doc.add_heading('媒体文件', level=2)
for media_file in article.media_files:
try:
import os
import requests
from django.conf import settings
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
# 检查文件扩展名以确定处理方式
file_extension = os.path.splitext(media_file)[1].lower()
# 图片文件处理
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
if os.path.exists(full_path):
# 添加图片到文档
doc.add_picture(full_path, width=Inches(4.0))
# 添加文件到ZIP包
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
# 将网络文件保存到ZIP
zip_file.writestr(
os.path.join(article_folder, 'media', os.path.basename(media_file)),
response.content)
else:
doc.add_paragraph(media_file)
# 视频文件处理
elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
# 视频文件只添加到ZIP包中不在Word文档中显示
if os.path.exists(full_path):
# 添加文件到ZIP包
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
# 在Word文档中添加视频文件信息
doc.add_paragraph(f"[视频文件: {media_file}]")
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
# 将网络文件保存到ZIP
response = requests.get(media_file, timeout=10)
zip_file.writestr(
os.path.join(article_folder, 'media', os.path.basename(media_file)),
response.content)
doc.add_paragraph(f"[视频文件: {media_file}]")
else:
doc.add_paragraph(media_file)
# 其他文件类型
else:
if os.path.exists(full_path):
# 添加文件到ZIP包
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
doc.add_paragraph(f"[文件: {media_file}]")
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
zip_file.writestr(
os.path.join(article_folder, 'media', os.path.basename(media_file)),
response.content)
doc.add_paragraph(f"[文件: {media_file}]")
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
# 保存每篇文章的Word文档到ZIP文件中的对应文件夹
doc_buffer = BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'),
doc_buffer.read())
# 创建HttpResponse
zip_buffer.seek(0)
from django.http import HttpResponse
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
return response
export_with_media.short_description = "导出选中文章及媒体文件(ZIP包)"
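# A quick inspection sketch for the archive produced above: every article gets its own
# folder containing a .docx plus a media/ subfolder. Assumes the exported file has been
# saved locally as articles_export.zip (the filename set in Content-Disposition above):
import zipfile
with zipfile.ZipFile('articles_export.zip') as zf:
    for name in zf.namelist():
        print(name)  # e.g. article_<id>_<title>/<title>.docx, article_<id>_<title>/media/...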
# 为不同网站创建专门的文章管理类
class NewsCnArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'pub_date')
search_fields = ('title', 'content')
list_filter = ('pub_date',)
actions = ['export_as_csv', 'export_as_json']
def get_queryset(self, request):
qs = super().get_queryset(request)
# 只显示新华网的文章
return qs.filter(website__name='www.news.cn')
def export_as_csv(self, request, queryset):
"""导出选中的文章为CSV格式"""
meta = self.model._meta
field_names = [field.name for field in meta.fields if field.name != 'content'] # 排除content字段以减小CSV大小
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv'
writer = csv.writer(response)
writer.writerow(field_names)
for obj in queryset:
row = []
for field in field_names:
value = getattr(obj, field)
if callable(value):
value = value()
if field == 'website':
value = value.name
row.append(value)
writer.writerow(row)
return response
export_as_csv.short_description = "导出选中文章为CSV格式"
def export_as_json(self, request, queryset):
"""导出选中的文章为JSON格式"""
response = HttpResponse(content_type='application/json')
response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json'
# 构造要导出的数据
articles_data = []
for article in queryset:
articles_data.append({
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
})
# 写入JSON数据
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
return response
export_as_json.short_description = "导出选中文章为JSON格式"
class DongfangyancaoArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'pub_date')
search_fields = ('title', 'content')
list_filter = ('pub_date',)
# 添加动作选项
actions = ['delete_selected_articles', 'delete_all_articles', 'export_as_csv', 'export_as_json']
def get_queryset(self, request):
qs = super().get_queryset(request)
# 只显示东方烟草报的文章
return qs.filter(website__name='东方烟草报')
def delete_all_articles(self, request, queryset):
"""删除当前筛选的所有文章(东方烟草报的所有文章)"""
# 删除所有东方烟草报的文章
deleted_count = self.get_queryset(request).delete()[0]
self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS)
# 设置动作的显示名称
delete_all_articles.short_description = "删除所有当前筛选的文章"
def export_as_csv(self, request, queryset):
"""导出选中的文章为CSV格式"""
meta = self.model._meta
field_names = [field.name for field in meta.fields if field.name != 'content'] # 排除content字段以减小CSV大小
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.csv'
writer = csv.writer(response)
writer.writerow(field_names)
for obj in queryset:
row = []
for field in field_names:
value = getattr(obj, field)
if callable(value):
value = value()
if field == 'website':
value = value.name
row.append(value)
writer.writerow(row)
return response
export_as_csv.short_description = "导出选中文章为CSV格式"
def export_as_json(self, request, queryset):
"""导出选中的文章为JSON格式"""
response = HttpResponse(content_type='application/json')
response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json'
# 构造要导出的数据
articles_data = []
for article in queryset:
articles_data.append({
'id': article.id,
'title': article.title,
'website': article.website.name,
'url': article.url,
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
'content': article.content,
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
})
# 写入JSON数据
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
return response
export_as_json.short_description = "导出选中文章为JSON格式"
# 在各自的管理站点中注册模型

core/admin_extended.py (new file): 384 lines added

@@ -0,0 +1,384 @@
"""
Django Admin扩展
提供增强的管理界面功能
"""
import logging
from datetime import datetime, timedelta
from django.contrib import admin
from django.contrib.admin import SimpleListFilter
from django.contrib.admin.utils import model_format_dict
from django.contrib import messages
from django.http import HttpResponseRedirect
from django.urls import path, reverse
from django.utils.html import format_html
from django.utils import timezone
from django.db.models import Count, Q
from django.core.cache import cache
from .models import Website, Article
from .tasks import crawl_website, crawl_all_websites, cleanup_old_articles
from .distributed_crawler import distributed_crawler
logger = logging.getLogger(__name__)
class WebsiteStatusFilter(SimpleListFilter):
"""网站状态过滤器"""
title = '网站状态'
parameter_name = 'status'
def lookups(self, request, model_admin):
return (
('enabled', '已启用'),
('disabled', '已禁用'),
('no_articles', '无文章'),
('recent_crawl', '最近爬取'),
)
def queryset(self, request, queryset):
if self.value() == 'enabled':
return queryset.filter(enabled=True)
elif self.value() == 'disabled':
return queryset.filter(enabled=False)
elif self.value() == 'no_articles':
return queryset.annotate(article_count=Count('article')).filter(article_count=0)
elif self.value() == 'recent_crawl':
week_ago = timezone.now() - timedelta(days=7)
return queryset.filter(last_crawl__gte=week_ago)
return queryset
class ArticleDateFilter(SimpleListFilter):
"""文章日期过滤器"""
title = '发布时间'
parameter_name = 'date_range'
def lookups(self, request, model_admin):
return (
('today', '今天'),
('week', '本周'),
('month', '本月'),
('quarter', '本季度'),
)
def queryset(self, request, queryset):
now = timezone.now()
if self.value() == 'today':
return queryset.filter(created_at__date=now.date())
elif self.value() == 'week':
week_start = now - timedelta(days=now.weekday())
return queryset.filter(created_at__gte=week_start.replace(hour=0, minute=0, second=0))
elif self.value() == 'month':
return queryset.filter(created_at__year=now.year, created_at__month=now.month)
elif self.value() == 'quarter':
quarter = (now.month - 1) // 3
quarter_start_month = quarter * 3 + 1
return queryset.filter(
created_at__year=now.year,
created_at__month__gte=quarter_start_month,
created_at__month__lt=quarter_start_month + 3
)
return queryset
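# A worked example of the quarter arithmetic above (illustration only):
month = 8                              # August
quarter = (month - 1) // 3             # -> 2, i.e. the third quarter (zero-based)
quarter_start_month = quarter * 3 + 1  # -> 7, so the filter keeps July..September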
class WebsiteAdmin(admin.ModelAdmin):
"""网站管理"""
list_display = [
'name', 'base_url', 'enabled', 'article_count',
'last_crawl_display', 'status_indicator', 'actions_column'
]
list_filter = [WebsiteStatusFilter, 'enabled']
search_fields = ['name', 'base_url']
readonly_fields = ['article_count']
actions = ['enable_websites', 'disable_websites', 'crawl_selected', 'crawl_all']
fieldsets = (
('基本信息', {
'fields': ('name', 'base_url', 'enabled')
}),
('统计信息', {
'fields': ('article_count',),
'classes': ('collapse',)
}),
('时间信息', {
'fields': (),
'classes': ('collapse',)
}),
)
# 添加get_websites方法以支持模板中的网站选择
def get_websites(self, request):
"""获取所有启用的网站,用于模板中的选择框"""
return Website.objects.filter(enabled=True)
def article_count(self, obj):
"""文章数量"""
return obj.article_set.count()
article_count.short_description = '文章数量'
def last_crawl_display(self, obj):
"""最后爬取时间显示"""
return '未实现'
last_crawl_display.short_description = '最后爬取'
def status_indicator(self, obj):
"""状态指示器"""
if obj.enabled:
return format_html('<span style="color: green;">●</span> 正常')
else:
return format_html('<span style="color: red;">●</span> 禁用')
status_indicator.short_description = '状态'
def actions_column(self, obj):
"""操作列"""
return format_html(
'<a href="{}" class="button">爬取</a> '
'<a href="{}" class="button">查看文章</a>',
reverse('admin:crawl_website', args=[obj.id]),
reverse('admin:core_article_changelist') + f'?website__id__exact={obj.id}'
)
actions_column.short_description = '操作'
def enable_websites(self, request, queryset):
"""启用选中的网站"""
updated = queryset.update(enabled=True)
self.message_user(request, f'成功启用 {updated} 个网站')
enable_websites.short_description = '启用选中的网站'
def disable_websites(self, request, queryset):
"""禁用选中的网站"""
updated = queryset.update(enabled=False)
self.message_user(request, f'成功禁用 {updated} 个网站')
disable_websites.short_description = '禁用选中的网站'
def crawl_selected(self, request, queryset):
"""爬取选中的网站"""
for website in queryset:
try:
task = crawl_website.delay(website.id)
self.message_user(
request,
f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(
request,
f'网站 {website.name} 爬取任务启动失败: {detailed_msg}',
messages.ERROR
)
crawl_selected.short_description = '爬取选中的网站'
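# A minimal broker check sketch for the "[Errno 61] Connection refused" branch above,
# assuming the redis-py package is installed and the broker runs at the default
# redis://localhost:6379/0 (adjust to settings.CELERY_BROKER_URL):
import redis
broker = redis.Redis.from_url('redis://localhost:6379/0')
print(broker.ping())  # True once redis-server (or the Docker container) is reachable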
def crawl_all(self, request, queryset):
try:
task = crawl_all_websites.delay()
self.message_user(
request,
f'批量爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(
request,
f'批量爬取任务启动失败: {detailed_msg}',
messages.ERROR
)
# crawl_all.short_description = '爬取所有网站'
def get_urls(self):
"""添加自定义URL"""
urls = super().get_urls()
custom_urls = [
path(
'<int:website_id>/crawl/',
self.admin_site.admin_view(self.crawl_website_view),
name='crawl_website',
),
path(
'run-crawler/',
self.admin_site.admin_view(self.run_crawler_view),
name='run_crawler',
),
]
return custom_urls + urls
def crawl_website_view(self, request, website_id):
"""爬取单个网站视图"""
try:
website = Website.objects.get(id=website_id)
task = crawl_website.delay(website_id)
self.message_user(
request,
f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Website.DoesNotExist:
self.message_user(request, '网站不存在', messages.ERROR)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(request, f'爬取任务启动失败: {detailed_msg}', messages.ERROR)
return HttpResponseRedirect(reverse('admin:core_website_changelist'))
def run_crawler_view(self, request):
"""运行爬虫视图"""
try:
task = crawl_all_websites.delay()
self.message_user(
request,
f'批量爬取任务已启动 (任务ID: {task.id})',
messages.SUCCESS
)
except Exception as e:
error_msg = str(e)
if "[Errno 61] Connection refused" in error_msg:
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
else:
detailed_msg = error_msg
self.message_user(
request,
f'批量爬取任务启动失败: {detailed_msg}',
messages.ERROR
)
return HttpResponseRedirect(reverse('admin:core_website_changelist'))
class ArticleAdmin(admin.ModelAdmin):
"""文章管理"""
list_display = [
'title', 'website', 'created_at',
'media_count', 'actions_column'
]
list_filter = [
ArticleDateFilter, 'website', 'created_at'
]
search_fields = ['title', 'content', 'url']
readonly_fields = ['created_at', 'media_files_display']
date_hierarchy = 'created_at'
fieldsets = (
('基本信息', {
'fields': ('title', 'url', 'website')
}),
('内容', {
'fields': ('content',)
}),
('媒体文件', {
'fields': ('media_files_display',),
'classes': ('collapse',)
}),
('时间信息', {
'fields': ('created_at',),
'classes': ('collapse',)
}),
)
def content_preview(self, obj):
"""内容预览"""
return obj.content[:100] + '...' if len(obj.content) > 100 else obj.content
content_preview.short_description = '内容预览'
def media_count(self, obj):
"""媒体文件数量"""
if obj.media_files:
return len(obj.media_files)
return 0
media_count.short_description = '媒体文件'
def media_files_display(self, obj):
"""媒体文件显示"""
if not obj.media_files:
return '无媒体文件'
html = '<div style="max-height: 300px; overflow-y: auto;">'
for i, media in enumerate(obj.media_files):
if media.get('type') == 'image':
html += f'<div style="margin: 10px 0;"><img src="{media["url"]}" style="max-width: 200px; max-height: 150px;" /></div>'
elif media.get('type') == 'video':
html += f'<div style="margin: 10px 0;"><video controls style="max-width: 200px;"><source src="{media["url"]}" type="video/mp4"></video></div>'
html += '</div>'
return format_html(html)
media_files_display.short_description = '媒体文件'
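# A sketch of the entry shape media_files_display expects: a dict with 'type' and 'url'
# keys. Elsewhere in this commit media_files is handled as a list of path strings, so
# the structure below is an assumption for illustration only:
example_media_files = [
    {'type': 'image', 'url': '/media/images/example.jpg'},
    {'type': 'video', 'url': '/media/videos/example.mp4'},
]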
def actions_column(self, obj):
"""操作列"""
# 修改: 添加跳转到本地文章详情页的链接
return format_html(
'<a href="{}" target="_blank" class="button">查看原文</a> '
'<a href="{}" target="_blank" class="button">本地查看</a>',
obj.url,
reverse('article_detail', args=[obj.id])
)
actions_column.short_description = '操作'
class CrawlerStatusAdmin(admin.ModelAdmin):
"""爬虫状态管理"""
change_list_template = 'admin/crawler_status.html'
def changelist_view(self, request, extra_context=None):
"""爬虫状态视图"""
# 获取分布式爬虫状态
nodes = distributed_crawler.get_available_nodes()
node_statuses = []
for node_id in nodes:
status = distributed_crawler.get_node_status(node_id)
node_statuses.append(status)
# 获取最近的批次
batches = distributed_crawler.get_all_batches()[:10]
# 获取任务统计
task_stats = {
'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]),
'total_nodes': len(nodes),
'total_batches': len(batches),
}
extra_context = extra_context or {}
extra_context.update({
'nodes': node_statuses,
'batches': batches,
'task_stats': task_stats,
})
return super().changelist_view(request, extra_context)
# 注册管理类
admin.site.register(Website, WebsiteAdmin)
admin.site.register(Article, ArticleAdmin)
# 自定义管理站点标题
admin.site.site_header = 'Green Classroom 管理系统'
admin.site.site_title = 'Green Classroom'
admin.site.index_title = '欢迎使用 Green Classroom 管理系统'

core/api.py (new file): 746 lines added

@@ -0,0 +1,746 @@
"""
RESTful API模块
提供完整的API接口支持爬虫管理、数据查询、任务控制
"""
import logging
from datetime import datetime, timedelta
from typing import Dict, List, Any
import json
import csv
import io
import zipfile
from django.http import JsonResponse, HttpResponse
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods
from django.core.paginator import Paginator
from django.db.models import Q, Count
from django.utils import timezone
# 添加DRF相关导入
from rest_framework.views import APIView
from rest_framework.response import Response
from rest_framework.permissions import IsAuthenticated
from rest_framework.authentication import SessionAuthentication, TokenAuthentication
# 添加python-docx库支持
from docx import Document
# 添加BeautifulSoup导入
from bs4 import BeautifulSoup
from .models import Website, Article
from .tasks import crawl_website, cleanup_old_articles
from .distributed_crawler import distributed_crawler
logger = logging.getLogger(__name__)
def api_response(data=None, message="", status=200, error=None):
"""统一的API响应格式"""
response = {
"success": status < 400,
"message": message,
"timestamp": datetime.now().isoformat(),
}
if data is not None:
response["data"] = data
if error:
response["error"] = error
# 如果是DRF视图则返回DRF Response
if hasattr(api_response, '_use_drf_response') and api_response._use_drf_response:
return Response(response, status=status)
return JsonResponse(response, status=status)
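# A sketch of the envelope api_response produces on success (values are illustrative):
example_envelope = {
    "success": True,
    "message": "获取网站列表成功",
    "timestamp": "2025-08-15T08:33:47",
    "data": {"websites": [], "pagination": {}, "stats": {}},
}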
# 修改健康检查接口为DRF类视图
class HealthView(APIView):
"""健康检查接口"""
permission_classes = [] # 允许无认证访问
authentication_classes = []
def get(self, request):
try:
# 检查数据库连接
website_count = Website.objects.count()
article_count = Article.objects.count()
# 检查Redis连接
from django.core.cache import cache
cache.set('health_check', 'ok', 60)
cache_result = cache.get('health_check')
health_data = {
"status": "healthy",
"database": "ok",
"redis": "ok" if cache_result == 'ok' else 'error',
"website_count": website_count,
"article_count": article_count,
"uptime": "running"
}
# 设置使用DRF响应
api_response._use_drf_response = True
return api_response(data=health_data, message="服务运行正常")
except Exception as e:
logger.error(f"健康检查失败: {e}")
return api_response(
data={"status": "unhealthy", "error": str(e)},
message="服务异常",
status=500,
error=str(e)
)
finally:
api_response._use_drf_response = False
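# A usage sketch for the health check, assuming the project routes HealthView at
# /api/health/ (the path is an assumption; adjust to the project's urls.py):
import requests
resp = requests.get('http://localhost:8000/api/health/', timeout=10)
print(resp.json()['data']['status'])  # "healthy" when database and Redis respond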
# 修改网站列表接口为DRF类视图
class WebsitesView(APIView):
"""获取网站列表"""
permission_classes = [IsAuthenticated]
authentication_classes = [SessionAuthentication, TokenAuthentication]
def get(self, request):
try:
# 分页参数
page = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 20))
search = request.GET.get('search', '')
enabled = request.GET.get('enabled', '')
# 构建查询
queryset = Website.objects.all()
if search:
queryset = queryset.filter(
Q(name__icontains=search) |
Q(base_url__icontains=search)
)
if enabled in ['true', 'false']:
queryset = queryset.filter(enabled=enabled == 'true')
# 排序 - 使用id字段替代不存在的created_at字段
queryset = queryset.order_by('-id')
# 分页
paginator = Paginator(queryset, page_size)
websites_page = paginator.get_page(page)
# 统计数据
stats = {
'total_websites': Website.objects.count(),
'enabled_websites': Website.objects.filter(enabled=True).count(),
'disabled_websites': Website.objects.filter(enabled=False).count(),
}
# 序列化数据
websites_data = []
for website in websites_page:
website_data = {
'id': website.id,
'name': website.name,
'base_url': website.base_url,
'enabled': website.enabled,
# 移除不存在的created_at和updated_at字段
'article_count': website.article_set.count(),
'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
}
websites_data.append(website_data)
response_data = {
'websites': websites_data,
'pagination': {
'page': page,
'page_size': page_size,
'total_pages': paginator.num_pages,
'total_count': paginator.count,
'has_next': websites_page.has_next(),
'has_previous': websites_page.has_previous(),
},
'stats': stats
}
# 设置使用DRF响应
api_response._use_drf_response = True
return api_response(data=response_data, message="获取网站列表成功")
except Exception as e:
logger.error(f"获取网站列表失败: {e}")
return api_response(message="获取网站列表失败", status=500, error=str(e))
finally:
api_response._use_drf_response = False
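# A usage sketch for the website list, assuming a /api/websites/ route and token
# authentication (both the path and the token value are assumptions):
import requests
resp = requests.get(
    'http://localhost:8000/api/websites/',
    params={'page': 1, 'page_size': 20, 'search': 'news', 'enabled': 'true'},
    headers={'Authorization': 'Token <your-token>'},
    timeout=10,
)
print(resp.json()['data']['pagination'])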
@csrf_exempt
@require_http_methods(["GET"])
def api_website_detail(request, website_id):
"""获取网站详情"""
try:
website = Website.objects.get(id=website_id)
# 获取最近的文章
recent_articles = website.article_set.order_by('-created_at')[:10]
website_data = {
'id': website.id,
'name': website.name,
'base_url': website.base_url,
'enabled': website.enabled,
# Website currently has no created_at/updated_at/last_crawl fields (see the note in WebsitesView), so access them defensively
'created_at': website.created_at.isoformat() if getattr(website, 'created_at', None) else None,
'updated_at': website.updated_at.isoformat() if getattr(website, 'updated_at', None) else None,
'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
'article_count': website.article_set.count(),
'recent_articles': [
{
'id': article.id,
'title': article.title,
'url': article.url,
'created_at': article.created_at.isoformat(),
}
for article in recent_articles
]
}
return api_response(data=website_data, message="获取网站详情成功")
except Website.DoesNotExist:
return api_response(message="网站不存在", status=404, error="Website not found")
except Exception as e:
logger.error(f"获取网站详情失败: {e}")
return api_response(message="获取网站详情失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["POST"])
def api_crawl_website(request, website_id):
"""爬取指定网站"""
try:
website = Website.objects.get(id=website_id)
# 启动爬虫任务
task = crawl_website.delay(website_id)
response_data = {
'task_id': task.id,
'website_id': website_id,
'website_name': website.name,
'status': 'started'
}
return api_response(data=response_data, message="爬虫任务已启动")
except Website.DoesNotExist:
return api_response(message="网站不存在", status=404, error="Website not found")
except Exception as e:
logger.error(f"启动爬虫任务失败: {e}")
return api_response(message="启动爬虫任务失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET"])
def api_articles(request):
"""获取文章列表"""
try:
# 分页参数
page = int(request.GET.get('page', 1))
page_size = int(request.GET.get('page_size', 20))
search = request.GET.get('search', '')
website_id = request.GET.get('website_id', '')
date_from = request.GET.get('date_from', '')
date_to = request.GET.get('date_to', '')
# 构建查询
queryset = Article.objects.select_related('website').all()
if search:
queryset = queryset.filter(
Q(title__icontains=search) |
Q(content__icontains=search)
)
if website_id:
queryset = queryset.filter(website_id=website_id)
if date_from:
try:
date_from_obj = datetime.fromisoformat(date_from.replace('Z', '+00:00'))
queryset = queryset.filter(created_at__gte=date_from_obj)
except ValueError:
pass
if date_to:
try:
date_to_obj = datetime.fromisoformat(date_to.replace('Z', '+00:00'))
queryset = queryset.filter(created_at__lte=date_to_obj)
except ValueError:
pass
# 排序
queryset = queryset.order_by('-created_at')
# 分页
paginator = Paginator(queryset, page_size)
articles_page = paginator.get_page(page)
# 统计数据
stats = {
'total_articles': Article.objects.count(),
'today_articles': Article.objects.filter(
created_at__date=timezone.now().date()
).count(),
'week_articles': Article.objects.filter(
created_at__gte=timezone.now() - timedelta(days=7)
).count(),
}
# 序列化数据
articles_data = []
for article in articles_page:
article_data = {
'id': article.id,
'title': article.title,
'url': article.url,
'content': article.content[:200] + '...' if len(article.content) > 200 else article.content,
'created_at': article.created_at.isoformat(),
'website': {
'id': article.website.id,
'name': article.website.name,
},
'media_files': article.media_files,
}
articles_data.append(article_data)
response_data = {
'articles': articles_data,
'pagination': {
'page': page,
'page_size': page_size,
'total_pages': paginator.num_pages,
'total_count': paginator.count,
'has_next': articles_page.has_next(),
'has_previous': articles_page.has_previous(),
},
'stats': stats
}
return api_response(data=response_data, message="获取文章列表成功")
except Exception as e:
logger.error(f"获取文章列表失败: {e}")
return api_response(message="获取文章列表失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET"])
def api_article_detail(request, article_id):
"""获取文章详情"""
try:
article = Article.objects.select_related('website').get(id=article_id)
article_data = {
'id': article.id,
'title': article.title,
'url': article.url,
'content': article.content,
'created_at': article.created_at.isoformat(),
'website': {
'id': article.website.id,
'name': article.website.name,
'base_url': article.website.base_url,
},
'media_files': article.media_files,
}
return api_response(data=article_data, message="获取文章详情成功")
except Article.DoesNotExist:
return api_response(message="文章不存在", status=404, error="Article not found")
except Exception as e:
logger.error(f"获取文章详情失败: {e}")
return api_response(message="获取文章详情失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET"])
def api_crawler_status(request):
"""获取爬虫状态"""
try:
# 获取分布式爬虫状态
nodes = distributed_crawler.get_available_nodes()
node_statuses = []
for node_id in nodes:
status = distributed_crawler.get_node_status(node_id)
node_statuses.append(status)
# 获取最近的批次
batches = distributed_crawler.get_all_batches()[:10]
# 获取任务统计
task_stats = {
'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]),
'total_nodes': len(nodes),
'total_batches': len(batches),
}
response_data = {
'nodes': node_statuses,
'batches': batches,
'stats': task_stats,
}
return api_response(data=response_data, message="获取爬虫状态成功")
except Exception as e:
logger.error(f"获取爬虫状态失败: {e}")
return api_response(message="获取爬虫状态失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["POST"])
def api_start_distributed_crawl(request):
"""启动分布式爬取"""
try:
data = json.loads(request.body)
website_ids = data.get('website_ids', [])
if not website_ids:
return api_response(message="请选择要爬取的网站", status=400, error="No websites selected")
# 启动分布式爬取
batch_id = distributed_crawler.distribute_crawl_tasks(website_ids)
if batch_id in ['no_websites', 'no_available_nodes']:
return api_response(message="无法启动分布式爬取", status=400, error=batch_id)
response_data = {
'batch_id': batch_id,
'website_ids': website_ids,
'status': 'started'
}
return api_response(data=response_data, message="分布式爬取已启动")
except json.JSONDecodeError:
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
except Exception as e:
logger.error(f"启动分布式爬取失败: {e}")
return api_response(message="启动分布式爬取失败", status=500, error=str(e))
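# A request sketch for starting a distributed crawl; the /api/crawler/distributed/
# path is an assumption, while the body mirrors the fields read above:
import requests
resp = requests.post(
    'http://localhost:8000/api/crawler/distributed/',
    json={'website_ids': [1, 2, 3]},
    timeout=10,
)
print(resp.json().get('data', {}).get('batch_id'))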
@csrf_exempt
@require_http_methods(["GET"])
def api_batch_status(request, batch_id):
"""获取批次状态"""
try:
batch_status = distributed_crawler.get_batch_status(batch_id)
if batch_status.get('status') == 'not_found':
return api_response(message="批次不存在", status=404, error="Batch not found")
return api_response(data=batch_status, message="获取批次状态成功")
except Exception as e:
logger.error(f"获取批次状态失败: {e}")
return api_response(message="获取批次状态失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["GET", "POST"])
def api_cleanup_articles(request):
"""清理旧文章"""
# 如果是GET请求返回清理功能的描述信息
if request.method == "GET":
response_data = {
'description': '文章清理API',
'method': 'POST',
'parameters': {
'days': '保留天数默认30天'
},
'example': {
'days': 30
}
}
return api_response(data=response_data, message="API使用说明")
try:
data = json.loads(request.body)
days = data.get('days', 30)
# 启动清理任务
task = cleanup_old_articles.delay(days)
response_data = {
'task_id': task.id,
'days': days,
'status': 'started'
}
return api_response(data=response_data, message="清理任务已启动")
except json.JSONDecodeError:
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
except Exception as e:
logger.error(f"启动清理任务失败: {e}")
return api_response(message="启动清理任务失败", status=500, error=str(e))
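# A request sketch for the cleanup endpoint; the path is an assumption and 'days'
# is the only parameter documented above:
import requests
resp = requests.post('http://localhost:8000/api/articles/cleanup/', json={'days': 30}, timeout=10)
print(resp.json().get('message'))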
@csrf_exempt
@require_http_methods(["GET"])
def api_stats(request):
"""获取统计信息"""
try:
# 基础统计
total_websites = Website.objects.count()
total_articles = Article.objects.count()
enabled_websites = Website.objects.filter(enabled=True).count()
# 时间统计
today = timezone.now().date()
week_ago = timezone.now() - timedelta(days=7)
month_ago = timezone.now() - timedelta(days=30)
today_articles = Article.objects.filter(created_at__date=today).count()
week_articles = Article.objects.filter(created_at__gte=week_ago).count()
month_articles = Article.objects.filter(created_at__gte=month_ago).count()
# 网站统计
website_stats = []
for website in Website.objects.all():
website_stats.append({
'id': website.id,
'name': website.name,
'article_count': website.article_set.count(),
# 使用getattr安全访问last_crawl属性如果不存在则返回None
'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
})
# 分布式爬虫统计
nodes = distributed_crawler.get_available_nodes()
batches = distributed_crawler.get_all_batches()
response_data = {
'overview': {
'total_websites': total_websites,
'enabled_websites': enabled_websites,
'total_articles': total_articles,
'today_articles': today_articles,
'week_articles': week_articles,
'month_articles': month_articles,
},
'websites': website_stats,
'crawler': {
'active_nodes': len(nodes),
'total_batches': len(batches),
'recent_batches': batches[:5],
}
}
return api_response(data=response_data, message="获取统计信息成功")
except Exception as e:
logger.error(f"获取统计信息失败: {e}")
return api_response(message="获取统计信息失败", status=500, error=str(e))
@csrf_exempt
@require_http_methods(["POST"])
def export_articles(request):
"""导出文章"""
try:
data = json.loads(request.body)
article_ids = data.get('article_ids', [])
export_format = data.get('format', 'docx') # 默认改为docx格式
if not article_ids:
return api_response(message="请选择要导出的文章", status=400, error="No articles selected")
# 获取文章数据
articles = Article.objects.filter(id__in=article_ids).select_related('website')
if not articles.exists():
return api_response(message="未找到指定的文章", status=404, error="Articles not found")
import os # 添加导入
from django.conf import settings # 添加导入
if export_format == 'json':
# 导出为JSON格式
articles_data = []
for article in articles:
articles_data.append({
'id': article.id,
'title': article.title,
'url': article.url,
'content': article.content,
'created_at': article.created_at.isoformat(),
'website': {
'id': article.website.id,
'name': article.website.name,
},
'media_files': article.media_files,
})
response = HttpResponse(
json.dumps(articles_data, ensure_ascii=False, indent=2),
content_type='application/json'
)
response['Content-Disposition'] = 'attachment; filename="articles.json"'
return response
elif export_format == 'csv':
# 导出为CSV格式
output = io.StringIO()
writer = csv.writer(output)
writer.writerow(['ID', '标题', '网址', '内容', '创建时间', '网站'])
for article in articles:
writer.writerow([
article.id,
article.title,
article.url,
article.content[:1000] + '...' if len(article.content) > 1000 else article.content,
article.created_at.isoformat(),
article.website.name
])
response = HttpResponse(output.getvalue(), content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename="articles.csv"'
return response
elif export_format == 'docx':
# 导出为Word格式每个文章一个文件夹
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
for article in articles:
# 创建文章文件夹名称
safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip()
folder_name = f"article_{article.id}_{safe_title}"[:50]
# 创建Word文档
doc = Document()
doc.add_heading(article.title, 0)
# 添加文章信息
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"网址: {article.url}")
doc.add_paragraph(f"发布时间: {article.pub_date.isoformat() if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}")
# 添加内容标题
doc.add_heading('内容:', level=1)
# 处理HTML内容
content_text = BeautifulSoup(article.content, 'html.parser').get_text()
doc.add_paragraph(content_text)
# 将文档保存到内存中
doc_buffer = io.BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
# 添加到ZIP文件
zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue())
# 添加媒体文件(如果存在)
if article.media_files:
for media in article.media_files:
try:
# 如果是本地文件路径
if not media.startswith('http'):
media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/'))
if os.path.exists(media_path):
zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}")
# 如果是URL格式的媒体文件
else:
import requests
from io import BytesIO
response = requests.get(media, timeout=10)
if response.status_code == 200:
image_stream = BytesIO(response.content)
media_filename = f"{folder_name}/media/{os.path.basename(media)}"
zip_file.writestr(media_filename, image_stream.getvalue())
except Exception:
# 忽略无法添加的媒体文件
pass
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename="articles.zip"'
return response
elif export_format == 'zip':
# 导出为ZIP包每个文章一个文件夹
zip_buffer = io.BytesIO()
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
for article in articles:
# 创建文章文件夹名称
safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip()
folder_name = f"article_{article.id}_{safe_title}"[:50]
# 创建Word文档
doc = Document()
doc.add_heading(article.title, 0)
# 添加文章信息
doc.add_paragraph(f"网站: {article.website.name}")
doc.add_paragraph(f"网址: {article.url}")
doc.add_paragraph(f"发布时间: {article.pub_date.isoformat() if article.pub_date else 'N/A'}")
doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}")
# 添加内容标题
doc.add_heading('内容:', level=1)
# 处理HTML内容
content_text = BeautifulSoup(article.content, 'html.parser').get_text()
doc.add_paragraph(content_text)
# 将文档保存到内存中
doc_buffer = io.BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
# 添加到ZIP文件
zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue())
# 添加媒体文件(如果存在)
if article.media_files:
for media in article.media_files:
try:
# 如果是本地文件路径
if not media.startswith('http'):
media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/'))
if os.path.exists(media_path):
zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}")
# 如果是URL格式的媒体文件
else:
import requests
from io import BytesIO
response = requests.get(media, timeout=10)
if response.status_code == 200:
image_stream = BytesIO(response.content)
media_filename = f"{folder_name}/media/{os.path.basename(media)}"
zip_file.writestr(media_filename, image_stream.getvalue())
except Exception:
# 忽略无法添加的媒体文件
pass
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename="articles.zip"'
return response
else:
return api_response(message="不支持的导出格式", status=400, error="Unsupported format")
except json.JSONDecodeError:
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
except Exception as e:
logger.error(f"导出文章失败: {e}")
return api_response(message="导出文章失败", status=500, error=str(e))
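# A request sketch for the export endpoint; the path is an assumption, and the body
# mirrors the article_ids/format fields read above (docx and zip both return a ZIP):
import requests
resp = requests.post(
    'http://localhost:8000/api/articles/export/',
    json={'article_ids': [1, 2], 'format': 'docx'},
    timeout=30,
)
with open('articles.zip', 'wb') as fh:
    fh.write(resp.content)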


@@ -4,3 +4,8 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'core'
def ready(self):
"""应用启动时执行"""
# 导入Admin扩展
import core.admin_extended


@@ -9,7 +9,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--media', type=str, help='指定要爬取的媒体,用逗号分隔')
parser.add_argument('--platform', type=str, default='all',
help='指定平台类型: all(全部), web(网站), mobile(移动端)')
help='指定平台类型: all(全部), web(网站)')
def handle(self, *args, **options):
media_list = options['media']


@@ -9,7 +9,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['cctv', 'cctvnews', 'mobile', 'all'],
choices=['cctv', 'cctvnews', 'all'],
help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), all(全部)')
def handle(self, *args, **options):


@@ -3,13 +3,12 @@ from core.models import Website
from core.utils import full_site_crawler
# jimmy.fang-20250815: 因URL问题移除中国网-省份
class Command(BaseCommand):
help = "全站递归爬取 中国网主网,不转发二级子网站"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['china', 'province', 'all'],
choices=['china', 'all'],
help='选择爬取平台: china(中国网主网), all(全部)')
def handle(self, *args, **options):
@@ -23,12 +22,7 @@ class Command(BaseCommand):
'start_url': 'http://www.china.com.cn',
'article_selector': 'a'
},
# 'province': {
# 'name': '中国网一省份',
# 'base_url': 'http://www.china.com.cn',
# 'start_url': 'http://www.china.com.cn/province',
# 'article_selector': 'a'
# }
}
if platform == 'all':


@@ -8,7 +8,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['chinanews', 'mobile', 'all'],
choices=['chinanews', 'all'],
help='选择爬取平台: chinanews(中国新闻社), all(全部)')
def handle(self, *args, **options):


@@ -50,4 +50,4 @@ class Command(BaseCommand):
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("中国政府网所有平台爬取完成"))
self.stdout.write(self.style.SUCCESS("中国政府网所有平台爬取完成"))


@@ -50,4 +50,4 @@ class Command(BaseCommand):
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")
self.stdout.write(self.style.SUCCESS("东方烟草报所有平台爬取完成"))
self.stdout.write(self.style.SUCCESS("东方烟草报所有平台爬取完成"))


@@ -8,7 +8,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['fzrb', 'mobile', 'all'],
choices=['fzrb', 'all'],
help='选择爬取平台: fzrb(法治日报), all(全部)')
def handle(self, *args, **options):


@@ -2,13 +2,14 @@ from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
# jimmy.fang-20250815: 光明日报反爬,会被阻挡
# jimmy.fang-20250815: 取消对光明日报的支持,光明日报反爬,被阻挡
class Command(BaseCommand):
help = "全站递归爬取 光明日报及其子网站、客户端、新媒体平台"
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['gmrb', 'mobile', 'all'],
choices=['gmrb', 'all'],
help='选择爬取平台: gmrb(光明日报), all(全部)')
def handle(self, *args, **options):


@@ -8,7 +8,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['grrb', 'mobile', 'all'],
choices=['grrb', 'all'],
help='选择爬取平台: grrb(工人日报), all(全部)')
def handle(self, *args, **options):


@@ -8,7 +8,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['jjrb', 'mobile', 'all'],
choices=['jjrb', 'all'],
help='选择爬取平台: jjrb(经济日报), all(全部)')
def handle(self, *args, **options):


@@ -9,7 +9,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['kjrb', 'mobile', 'all'],
choices=['kjrb', 'all'],
help='选择爬取平台: kjrb(科技日报), all(全部)')
def handle(self, *args, **options):


@@ -8,7 +8,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['nmrb', 'mobile', 'all'],
choices=['nmrb', 'all'],
help='选择爬取平台: nmrb(农民日报), all(全部)')
def handle(self, *args, **options):


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['pla', 'mobile', 'all'],
help='选择爬取平台: pla(解放军报), mobile(移动端), all(全部)')
choices=['pla', 'all'],
help='选择爬取平台: pla(解放军报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['rmzxb', 'mobile', 'all'],
help='选择爬取平台: rmzxb(人民政协网), mobile(移动端), all(全部)')
choices=['rmzxb', 'all'],
help='选择爬取平台: rmzxb(人民政协网), all(全部)')
def handle(self, *args, **options):
platform = options['platform']


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['news', 'xinhuanet', 'mobile', 'all'],
help='选择爬取平台: news(新华网), xinhuanet(新华网主站), mobile(移动端), all(全部)')
choices=['news', 'all'],
help='选择爬取平台: news(新华网), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -22,18 +22,7 @@ class Command(BaseCommand):
'start_url': 'https://www.news.cn',
'article_selector': 'a'
},
'xinhuanet': {
'name': '新华网主站',
'base_url': 'https://www.xinhuanet.com',
'start_url': 'https://www.xinhuanet.com',
'article_selector': 'a'
},
'mobile': {
'name': '新华社移动端',
'base_url': 'https://m.xinhuanet.com',
'start_url': 'https://m.xinhuanet.com',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['xuexi', 'central', 'provincial', 'all'],
help='选择爬取平台: xuexi(学习强国主站), central(中央媒体), provincial(省级平台), all(全部)')
choices=['xuexi', 'all'],
help='选择爬取平台: xuexi(学习强国主站), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -22,18 +22,6 @@ class Command(BaseCommand):
'start_url': 'https://www.xuexi.cn',
'article_selector': 'a'
},
'central': {
'name': '学习强国中央媒体',
'base_url': 'https://www.xuexi.cn',
'start_url': 'https://www.xuexi.cn/central',
'article_selector': 'a'
},
'provincial': {
'name': '学习强国省级平台',
'base_url': 'https://www.xuexi.cn',
'start_url': 'https://www.xuexi.cn/provincial',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['xxsb', 'mobile', 'all'],
help='选择爬取平台: xxsb(学习时报), mobile(移动端), all(全部)')
choices=['xxsb', 'all'],
help='选择爬取平台: xxsb(学习时报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -22,12 +22,6 @@ class Command(BaseCommand):
'start_url': 'http://www.studytimes.cn',
'article_selector': 'a'
},
'mobile': {
'name': '学习时报移动端',
'base_url': 'http://m.studytimes.cn',
'start_url': 'http://m.studytimes.cn',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgfnb', 'mobile', 'all'],
help='选择爬取平台: zgfnb(中国妇女报), mobile(移动端), all(全部)')
choices=['zgfnb', 'all'],
help='选择爬取平台: zgfnb(中国妇女报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -22,12 +22,7 @@ class Command(BaseCommand):
'start_url': 'http://www.cnwomen.com.cn',
'article_selector': 'a'
},
'mobile': {
'name': '中国妇女报移动端',
'base_url': 'http://m.cnwomen.com.cn',
'start_url': 'http://m.cnwomen.com.cn',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgjwjc', 'mobile', 'all'],
help='选择爬取平台: zgjwjc(中国纪检监察报), mobile(移动端), all(全部)')
choices=['zgjwjc', 'all'],
help='选择爬取平台: zgjwjc(中国纪检监察报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -18,16 +18,10 @@ class Command(BaseCommand):
platforms = {
'zgjwjc': {
'name': '中国纪检监察报',
'base_url': 'http://www.jjjcb.cn',
'start_url': 'http://www.jjjcb.cn',
'base_url': 'https://jjjcb.ccdi.gov.cn',
'start_url': 'https://jjjcb.ccdi.gov.cn',
'article_selector': 'a'
},
'mobile': {
'name': '中国纪检监察报移动端',
'base_url': 'http://m.jjjcb.cn',
'start_url': 'http://m.jjjcb.cn',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -8,8 +8,8 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgqnb', 'mobile', 'all'],
help='选择爬取平台: zgqnb(中国青年报), mobile(移动端), all(全部)')
choices=['zgqnb', 'all'],
help='选择爬取平台: zgqnb(中国青年报), all(全部)')
def handle(self, *args, **options):
platform = options['platform']
@@ -22,12 +22,7 @@ class Command(BaseCommand):
'start_url': 'https://www.cyol.com',
'article_selector': 'a'
},
'mobile': {
'name': '中国青年报移动端',
'base_url': 'https://m.cyol.com',
'start_url': 'https://m.cyol.com',
'article_selector': 'a'
}
}
if platform == 'all':


@@ -6,6 +6,10 @@ import os
from django.conf import settings
import zipfile
from django.utils import timezone
from bs4 import BeautifulSoup
# 添加python-docx库支持
import io
from docx import Document
class Command(BaseCommand):
@@ -119,201 +123,100 @@ class Command(BaseCommand):
# 添加Word格式导出方法
def export_as_word(self, articles_data, output_path):
try:
from docx import Document
from docx.shared import Inches
except ImportError:
self.stdout.write(self.style.ERROR('缺少python-docx库请安装: pip install python-docx'))
return
# 创建Word文档
doc = Document()
doc.add_heading('文章导出', 0)
for article_data in articles_data:
# 添加文章标题
doc.add_heading(article_data['title'], level=1)
# 添加文章元数据
doc.add_paragraph(f"网站: {article_data['website']}")
doc.add_paragraph(f"URL: {article_data['url']}")
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
doc.add_paragraph(f"创建时间: {article_data['created_at']}")
# 添加文章内容
doc.add_heading('内容', level=2)
# 简单处理HTML内容移除标签
from bs4 import BeautifulSoup
soup = BeautifulSoup(article_data['content'], 'html.parser')
# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
# 尝试添加图片到文档
try:
import os
from django.conf import settings
import requests
from io import BytesIO
# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
except Exception as e:
# 如果添加图片失败添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")
# 移除原始img标签
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
# 添加媒体文件信息
if article_data['media_files']:
doc.add_heading('媒体文件', level=2)
for media_file in article_data['media_files']:
try:
import os
from django.conf import settings
from io import BytesIO
import requests
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加图片到文档
doc.add_picture(full_path, width=Inches(4.0))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
# 添加分页符
doc.add_page_break()
# 保存文档
doc.save(output_path)
# 创建一个新的Word文档
document = Document()
document.add_heading('文章导出', 0)
for article_data in articles_data:
# 添加文章标题
document.add_heading(article_data['title'], level=1)
# 添加文章信息
document.add_paragraph(f"网站: {article_data['website']}")
document.add_paragraph(f"URL: {article_data['url']}")
document.add_paragraph(f"发布时间: {article_data['pub_date']}")
document.add_paragraph(f"创建时间: {article_data['created_at']}")
# 添加内容标题
document.add_heading('内容:', level=2)
# 处理HTML内容移除标签
soup = BeautifulSoup(article_data['content'], 'html.parser')
content_text = soup.get_text()
document.add_paragraph(content_text)
# 添加分页符分隔文章
document.add_page_break()
# 保存文档
document.save(output_path)
self.stdout.write(self.style.SUCCESS(f'成功导出为Word格式: {output_path}'))
except Exception as e:
self.stdout.write(self.style.ERROR(f'导出Word格式失败: {e}'))
def export_with_media(self, articles_data, media_files, output_path, format_type):
# 创建ZIP文件
with zipfile.ZipFile(output_path, 'w') as zipf:
# 添加文章数据文件
data_filename = f'articles.{format_type}'
if format_type == 'json':
json_data = json.dumps(articles_data, ensure_ascii=False, indent=2)
zipf.writestr(data_filename, json_data)
elif format_type == 'csv':
# 创建CSV内容
if articles_data:
import io
csv_buffer = io.StringIO()
fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
writer.writeheader()
for article_data in articles_data:
article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
'media_files'] else ''
writer.writerow(article_data)
zipf.writestr(data_filename, csv_buffer.getvalue())
elif format_type == 'docx':
# 创建Word文档并保存到ZIP
try:
from docx import Document
from docx.shared import Inches
from io import BytesIO
doc = Document()
doc.add_heading('文章导出', 0)
for article_data in articles_data:
doc.add_heading(article_data['title'], level=1)
# 为每篇文章创建独立的文件
for article_data in articles_data:
article_folder = f"article_{article_data['id']}_{article_data['title']}"
# 限制文件夹名称长度并移除非法字符
article_folder = article_folder[:50].rstrip()
article_folder = "".join(c for c in article_folder if c.isalnum() or c in (' ','_','-')).rstrip()
# 添加文章数据文件
if format_type == 'docx':
# 创建Word文档并保存到ZIP
data_filename = f'{article_folder}/article.docx'
try:
# 创建文章信息Word文档
doc = Document()
doc.add_heading(article_data['title'], 0)
# 添加文章信息
doc.add_paragraph(f"网站: {article_data['website']}")
doc.add_paragraph(f"URL: {article_data['url']}")
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
doc.add_paragraph(f"创建时间: {article_data['created_at']}")
doc.add_heading('内容', level=2)
from bs4 import BeautifulSoup
# 添加内容标题
doc.add_heading('内容:', level=1)
# 处理HTML内容
soup = BeautifulSoup(article_data['content'], 'html.parser')
# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
# 尝试添加图片到文档
try:
import os
from django.conf import settings
import requests
# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
except Exception as e:
# 如果添加图片失败添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")
# 移除原始img标签
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
if article_data['media_files']:
doc.add_heading('媒体文件', level=2)
for media_file in article_data['media_files']:
try:
import os
from django.conf import settings
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加图片到文档
doc.add_picture(full_path, width=Inches(4.0))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
doc.add_page_break()
# 将文档保存到内存中再写入ZIP
doc_buffer = BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
zipf.writestr(data_filename, doc_buffer.read())
except ImportError:
zipf.writestr(data_filename, "错误缺少python-docx库无法生成Word文档")
# 添加媒体文件
for media_path in media_files:
arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
zipf.write(media_path, arcname)
except Exception as e:
error_msg = f"错误无法生成文章Word文档 - {str(e)}"
zipf.writestr(data_filename, error_msg)
# 添加媒体文件到文章的media子文件夹
if article_data['media_files']:
for media_file in article_data['media_files']:
try:
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加媒体文件到ZIP中的media子文件夹
media_filename = f"{article_folder}/media/{os.path.basename(media_file)}"
zipf.write(full_path, media_filename)
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
import requests
from io import BytesIO
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
media_filename = f"{article_folder}/media/{os.path.basename(media_file)}"
zipf.writestr(media_filename, image_stream.getvalue())
except Exception as e:
# 错误处理,跳过无法添加的文件
pass
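For reference, a standalone sketch of the folder-name sanitization used above and the archive layout it produces; sanitize_folder_name is a hypothetical helper extracted here only for illustration, not part of the command itself.

def sanitize_folder_name(article_id, title, max_len=50):
    # Same idea as above: truncate first, then keep only alphanumerics, spaces, '_' and '-'.
    name = f"article_{article_id}_{title}"[:max_len].rstrip()
    return "".join(c for c in name if c.isalnum() or c in (' ', '_', '-')).rstrip()

# Resulting ZIP layout (sketch):
#   articles.json / articles.csv            - when format_type is 'json' or 'csv'
#   article_<id>_<title>/article.docx       - one Word file per article when format_type is 'docx'
#   article_<id>_<title>/media/<filename>   - media files referenced by that article
print(sanitize_folder_name(1, "Hello: World/Test"))  # -> article_1_Hello WorldTest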

View File

@@ -3,6 +3,7 @@
{% block object-tools %}
{{ block.super }}
<!--
<div style="margin-top: 10px;">
<form method="post" action="{% url 'admin:run_crawler' %}" style="display: inline-block;">
{% csrf_token %}
@@ -16,4 +17,5 @@
<input type="submit" value="执行爬虫" class="default" style="margin-left: 10px;"/>
</form>
</div>
-->
{% endblock %}

View File

@@ -0,0 +1,304 @@
{% extends "admin/base_site.html" %}
{% load static %}
{% block title %}爬虫状态 - {{ site_title|default:_('Django site admin') }}{% endblock %}
{% block extrastyle %}
<style>
.status-card {
background: white;
border: 1px solid #ddd;
border-radius: 8px;
padding: 20px;
margin: 20px 0;
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}
.status-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 20px;
padding-bottom: 10px;
border-bottom: 2px solid #f0f0f0;
}
.status-title {
font-size: 24px;
font-weight: bold;
color: #333;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 30px;
}
.stat-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 8px;
text-align: center;
}
.stat-number {
font-size: 32px;
font-weight: bold;
margin-bottom: 5px;
}
.stat-label {
font-size: 14px;
opacity: 0.9;
}
.nodes-section, .batches-section {
margin-top: 30px;
}
.section-title {
font-size: 20px;
font-weight: bold;
margin-bottom: 15px;
color: #333;
}
.node-item, .batch-item {
background: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 6px;
padding: 15px;
margin-bottom: 10px;
}
.node-header, .batch-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 10px;
}
.node-name, .batch-id {
font-weight: bold;
color: #333;
}
.node-status, .batch-status {
padding: 4px 8px;
border-radius: 4px;
font-size: 12px;
font-weight: bold;
}
.status-active {
background: #d4edda;
color: #155724;
}
.status-running {
background: #fff3cd;
color: #856404;
}
.status-completed {
background: #d1ecf1;
color: #0c5460;
}
.status-failed {
background: #f8d7da;
color: #721c24;
}
.node-details, .batch-details {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 10px;
font-size: 14px;
}
.detail-item {
display: flex;
justify-content: space-between;
}
.detail-label {
color: #666;
}
.detail-value {
font-weight: bold;
color: #333;
}
.progress-bar {
width: 100%;
height: 8px;
background: #e9ecef;
border-radius: 4px;
overflow: hidden;
margin-top: 10px;
}
.progress-fill {
height: 100%;
background: linear-gradient(90deg, #28a745, #20c997);
transition: width 0.3s ease;
}
.refresh-btn {
background: #007bff;
color: white;
border: none;
padding: 8px 16px;
border-radius: 4px;
cursor: pointer;
font-size: 14px;
}
.refresh-btn:hover {
background: #0056b3;
}
.no-data {
text-align: center;
color: #666;
padding: 40px;
font-style: italic;
}
</style>
{% endblock %}
{% block content %}
<div class="status-card">
<div class="status-header">
<h1 class="status-title">爬虫状态监控</h1>
<button class="refresh-btn" onclick="location.reload()">刷新</button>
</div>
<!-- 统计卡片 -->
<div class="stats-grid">
<div class="stat-card">
<div class="stat-number">{{ task_stats.total_nodes }}</div>
<div class="stat-label">活跃节点</div>
</div>
<div class="stat-card">
<div class="stat-number">{{ task_stats.active_tasks }}</div>
<div class="stat-label">运行中任务</div>
</div>
<div class="stat-card">
<div class="stat-number">{{ task_stats.total_batches }}</div>
<div class="stat-label">总批次</div>
</div>
<div class="stat-card">
<div class="stat-number">{{ nodes|length }}</div>
<div class="stat-label">在线节点</div>
</div>
</div>
<!-- 节点状态 -->
<div class="nodes-section">
<h2 class="section-title">爬虫节点状态</h2>
{% if nodes %}
{% for node in nodes %}
<div class="node-item">
<div class="node-header">
<span class="node-name">{{ node.node_id }}</span>
<span class="node-status status-active">{{ node.status }}</span>
</div>
<div class="node-details">
<div class="detail-item">
<span class="detail-label">活跃任务:</span>
<span class="detail-value">{{ node.active_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">完成任务:</span>
<span class="detail-value">{{ node.completed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">失败任务:</span>
<span class="detail-value">{{ node.failed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">最后心跳:</span>
<span class="detail-value">
{% if node.last_heartbeat %}
{{ node.last_heartbeat|date:"H:i:s" }}
{% else %}
未知
{% endif %}
</span>
</div>
</div>
</div>
{% endfor %}
{% else %}
<div class="no-data">
暂无活跃的爬虫节点
</div>
{% endif %}
</div>
<!-- 批次状态 -->
<div class="batches-section">
<h2 class="section-title">最近批次</h2>
{% if batches %}
{% for batch in batches %}
<div class="batch-item">
<div class="batch-header">
<span class="batch-id">{{ batch.batch_id }}</span>
<span class="batch-status status-{{ batch.status }}">
{% if batch.status == 'running' %}
运行中
{% elif batch.status == 'completed' %}
已完成
{% elif batch.status == 'failed' %}
失败
{% else %}
{{ batch.status }}
{% endif %}
</span>
</div>
<div class="batch-details">
<div class="detail-item">
<span class="detail-label">总任务:</span>
<span class="detail-value">{{ batch.total_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">已完成:</span>
<span class="detail-value">{{ batch.completed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">失败:</span>
<span class="detail-value">{{ batch.failed_tasks }}</span>
</div>
<div class="detail-item">
<span class="detail-label">进度:</span>
<span class="detail-value">{{ batch.progress|floatformat:1 }}%</span>
</div>
</div>
{% if batch.status == 'running' %}
<div class="progress-bar">
<div class="progress-fill" style="width: {{ batch.progress }}%"></div>
</div>
{% endif %}
</div>
{% endfor %}
{% else %}
<div class="no-data">
暂无批次记录
</div>
{% endif %}
</div>
</div>
<script>
// 自动刷新页面
setTimeout(function () {
location.reload();
}, 30000); // 30秒刷新一次
</script>
{% endblock %}
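The admin view that renders this template is not included in this commit; the sketch below only illustrates the context keys the template expects (the view name, template path, and data source are all assumptions).

# Hypothetical view sketch: supplies the nodes/batches/task_stats context used above.
from django.shortcuts import render
from django.utils import timezone

def crawler_status_view(request):
    # In practice these would come from Redis or the task-tracking tables (assumption).
    nodes = [{'node_id': 'node-1', 'status': 'active', 'active_tasks': 2,
              'completed_tasks': 10, 'failed_tasks': 1, 'last_heartbeat': timezone.now()}]
    batches = [{'batch_id': 'batch-20250815', 'status': 'running', 'total_tasks': 20,
                'completed_tasks': 12, 'failed_tasks': 1, 'progress': 60.0}]
    task_stats = {'total_nodes': len(nodes),
                  'active_tasks': sum(n['active_tasks'] for n in nodes),
                  'total_batches': len(batches)}
    return render(request, 'admin/crawler_status.html',
                  {'nodes': nodes, 'batches': batches, 'task_stats': task_stats})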

View File

@@ -40,7 +40,16 @@
margin-top: 20px;
}
.content img {
/* 优化:确保图片和视频不会超出容器显示 */
.content img, .content video {
max-width: 100%;
height: auto;
display: block;
margin: 10px 0;
}
/* 优化:确保iframe也不会超出容器显示 */
.content iframe {
max-width: 100%;
height: auto;
}
@@ -61,7 +70,7 @@
body {
padding: 10px;
}
.container {
padding: 15px;
}
@@ -69,21 +78,21 @@
</style>
</head>
<body>
<div class="container">
<a href="{% url 'article_list' %}" class="back-link">&laquo; 返回文章列表</a>
<div class="container">
<a href="{% url 'article_list' %}" class="back-link">&laquo; 返回文章列表</a>
<h1>{{ article.title }}</h1>
<h1>{{ article.title }}</h1>
<div class="meta">
网站: {{ article.website.name }} |
发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
创建时间: {{ article.created_at|date:"Y-m-d H:i" }} |
源网址: <a href="{{ article.url }}" target="_blank">{{ article.url }}</a>
</div>
<div class="content">
{{ article.content|safe }}
</div>
<div class="meta">
网站: {{ article.website.name }} |
发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
创建时间: {{ article.created_at|date:"Y-m-d H:i" }} |
源网址: <a href="{{ article.url }}" target="_blank">{{ article.url }}</a>
</div>
<div class="content">
{{ article.content|safe }}
</div>
</div>
</body>
</html>

View File

@@ -17,7 +17,7 @@
background: white;
padding: 30px;
margin-bottom: 20px;
box-shadow: 0 2px 5px rgba(0,0,0,0.05); /* 添加轻微阴影 */
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* 添加轻微阴影 */
border-radius: 8px; /* 添加圆角 */
}
@@ -240,7 +240,7 @@
<form method="get">
<input type="text" name="q" placeholder="输入关键词搜索文章..." value="{{ search_query }}">
{% if selected_website %}
<input type="hidden" name="website" value="{{ selected_website.id }}">
<input type="hidden" name="website" value="{{ selected_website.id }}">
{% endif %}
<input type="submit" value="搜索">
</form>
@@ -251,9 +251,11 @@
<div class="sidebar">
<div class="filters">
<strong>按网站筛选:</strong>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" {% if not selected_website %}class="active" {% endif %}>全部</a>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}"
{% if not selected_website %}class="active" {% endif %}>全部</a>
{% for website in websites %}
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" {% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}"
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
{% endfor %}
</div>
</div>
@@ -262,10 +264,10 @@
<div class="main-content">
<!-- 新增:搜索结果信息 -->
{% if search_query %}
<div class="search-info">
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
</div>
<div class="search-info">
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
</div>
{% endif %}
<!-- 新增:导出功能 -->
@@ -280,60 +282,70 @@
<ul>
{% for article in page_obj %}
<li>
<input type="checkbox" class="article-checkbox" value="{{ article.id }}" id="article_{{ article.id }}">
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
</li>
{% empty %}
<li>暂无文章</li>
<li>
<input type="checkbox" class="article-checkbox" value="{{ article.id }}"
id="article_{{ article.id }}">
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
</li>
{% empty %}
<li>暂无文章</li>
{% endfor %}
</ul>
<div class="pagination">
{% if page_obj.has_previous %}
{% if selected_website %}
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo; 首页</a>
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
{% endif %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo;
首页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
{% endif %}
{% endif %}
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
<!-- 修改:优化页码显示逻辑 -->
{% with page_obj.paginator as paginator %}
{% for num in paginator.page_range %}
{% if page_obj.number == num %}
<a href="#" class="current">{{ num }}</a>
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
{% if selected_website %}
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == 1 or num == paginator.num_pages %}
{% if selected_website %}
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
<span class="ellipsis">...</span>
{% endif %}
{% endfor %}
{% for num in paginator.page_range %}
{% if page_obj.number == num %}
<a href="#" class="current">{{ num }}</a>
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == 1 or num == paginator.num_pages %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
<span class="ellipsis">...</span>
{% endif %}
{% endfor %}
{% endwith %}
{% if page_obj.has_next %}
{% if selected_website %}
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
{% endif %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
{% endif %}
{% endif %}
</div>
</div>
@@ -396,25 +408,25 @@
format: 'json'
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.json';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 导出为CSV功能
@@ -434,25 +446,25 @@
format: 'csv'
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.csv';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 新增:导出为ZIP包功能
@@ -472,25 +484,25 @@
format: 'zip' // 指定导出格式为ZIP
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.zip';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 初始化导出按钮状态
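The same export flow can be driven from Python; a sketch using requests is shown below. The /export-articles/ path comes from core/urls.py, while the 'article_ids' field name is an assumption (the request body is truncated in this hunk).

# Sketch: mirrors the fetch() calls above. Endpoint path taken from core/urls.py;
# the 'article_ids' field name is assumed, not confirmed by this template.
import requests

base = 'http://localhost:8000'
session = requests.Session()
session.get(base + '/')                               # obtain the csrftoken cookie
csrf = session.cookies.get('csrftoken', '')

resp = session.post(base + '/export-articles/',
                    json={'article_ids': [1, 2, 3], 'format': 'zip'},
                    headers={'X-CSRFToken': csrf, 'Referer': base + '/'},
                    timeout=60)
resp.raise_for_status()
with open('articles.zip', 'wb') as f:                 # save the returned blob, as the JS does
    f.write(resp.content)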

View File

@@ -1,3 +1,312 @@
from django.test import TestCase
import os
import tempfile
import shutil
from django.test import TestCase, override_settings
from django.core.management import call_command
from django.core.management.base import CommandError
from django.utils import timezone
from django.core.files.uploadedfile import SimpleUploadedFile
from unittest.mock import patch, MagicMock
from .models import Website, Article
from .utils import process_article, download_media, is_valid_url, full_site_crawler
from .tasks import crawl_website, crawl_all_websites, health_check
# Create your tests here.
class WebsiteModelTest(TestCase):
"""网站模型测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com',
description='测试描述'
)
def test_website_creation(self):
"""测试网站创建"""
self.assertEqual(self.website.name, '测试网站')
self.assertEqual(self.website.base_url, 'https://test.com')
self.assertTrue(self.website.enabled)
def test_website_str(self):
"""测试网站字符串表示"""
self.assertEqual(str(self.website), '测试网站')
class ArticleModelTest(TestCase):
"""文章模型测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
self.article = Article.objects.create(
website=self.website,
title='测试文章',
url='https://test.com/article/1',
content='<p>测试内容</p>',
media_files=['image1.jpg', 'image2.jpg']
)
def test_article_creation(self):
"""测试文章创建"""
self.assertEqual(self.article.title, '测试文章')
self.assertEqual(self.article.url, 'https://test.com/article/1')
self.assertEqual(len(self.article.media_files), 2)
def test_article_str(self):
"""测试文章字符串表示"""
self.assertEqual(str(self.article), '测试文章')
class UtilsTest(TestCase):
"""工具函数测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
self.temp_dir = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.temp_dir)
def test_is_valid_url(self):
"""测试URL验证"""
from .utils import is_valid_url
# 有效URL
self.assertTrue(is_valid_url('https://test.com/article', 'test.com'))
self.assertTrue(is_valid_url('http://test.com/article', 'test.com'))
# 无效URL
self.assertFalse(is_valid_url('https://other.com/article', 'test.com'))
self.assertFalse(is_valid_url('ftp://test.com/article', 'test.com'))
self.assertFalse(is_valid_url('invalid-url', 'test.com'))
@patch('core.utils.requests.get')
def test_download_media(self, mock_get):
"""测试媒体下载"""
# 模拟响应
mock_response = MagicMock()
mock_response.content = b'fake image content'
mock_response.headers = {'content-type': 'image/jpeg'}
mock_get.return_value = mock_response
# 测试下载
result = download_media('https://test.com/image.jpg', self.temp_dir)
self.assertIsNotNone(result)
self.assertTrue(os.path.exists(result))
@patch('core.utils.requests.get')
@patch('core.utils.download_media')
def test_process_article_success(self, mock_download_media, mock_get):
"""测试文章处理成功"""
# 模拟HTML响应
html_content = '''
<html>
<head><title>测试文章</title></head>
<body>
<h1>测试文章标题</h1>
<div class="content">
<p>测试文章内容</p>
<img src="https://test.com/image.jpg">
</div>
</body>
</html>
'''
mock_response = MagicMock()
mock_response.text = html_content
mock_response.encoding = 'utf-8'
mock_response.raise_for_status.return_value = None
mock_get.return_value = mock_response
# 模拟媒体下载
mock_download_media.return_value = '/tmp/test_image.jpg'
# 测试文章处理
process_article('https://test.com/article/1', self.website)
# 验证文章是否保存
article = Article.objects.filter(url='https://test.com/article/1').first()
self.assertIsNotNone(article)
self.assertEqual(article.title, '测试文章标题')
class ManagementCommandsTest(TestCase):
"""管理命令测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
@patch('core.management.commands.crawl_all_media.call_command')
def test_crawl_all_media_command(self, mock_call_command):
"""测试批量爬取命令"""
# 模拟命令执行
mock_call_command.return_value = None
# 执行命令
call_command('crawl_all_media', media='rmrb,xinhua')
# 验证命令被调用
mock_call_command.assert_called()
class CeleryTasksTest(TestCase):
"""Celery任务测试"""
def setUp(self):
self.website = Website.objects.create(
name='测试网站',
base_url='https://test.com'
)
@patch('core.tasks.full_site_crawler')
def test_crawl_website_task(self, mock_crawler):
"""测试单个网站爬取任务"""
# 模拟爬虫函数
mock_crawler.return_value = None
# 执行任务
result = crawl_website(self.website.id)
# 验证结果
self.assertEqual(result['website_id'], self.website.id)
self.assertEqual(result['website_name'], '测试网站')
self.assertEqual(result['status'], 'success')
def test_crawl_website_task_invalid_id(self):
"""测试无效网站ID的任务"""
# 执行任务
with self.assertRaises(Exception):
crawl_website(99999)
@patch('core.tasks.crawl_website.delay')
def test_crawl_all_websites_task(self, mock_delay):
"""测试批量爬取任务"""
# 模拟子任务
mock_result = MagicMock()
mock_result.id = 'task-123'
mock_delay.return_value = mock_result
# 执行任务
result = crawl_all_websites()
# 验证结果
self.assertEqual(result['total_websites'], 1)
self.assertEqual(result['status'], 'started')
def test_health_check_task(self):
"""测试健康检查任务"""
# 执行任务
result = health_check()
# 验证结果
self.assertEqual(result['database'], 'ok')
self.assertEqual(result['website_count'], 1)
self.assertEqual(result['article_count'], 0)
class IntegrationTest(TestCase):
"""集成测试"""
def setUp(self):
self.website = Website.objects.create(
name='集成测试网站',
base_url='https://integration-test.com'
)
def test_full_workflow(self):
"""测试完整工作流程"""
# 1. 创建网站
self.assertEqual(Website.objects.count(), 1)
# 2. 创建文章
article = Article.objects.create(
website=self.website,
title='集成测试文章',
url='https://integration-test.com/article/1',
content='<p>集成测试内容</p>'
)
# 3. 验证关联关系
self.assertEqual(article.website, self.website)
self.assertEqual(self.website.article_set.count(), 1)
# 4. 验证数据完整性
self.assertIsNotNone(article.created_at)
self.assertIsInstance(article.media_files, list)
@override_settings(MEDIA_ROOT=tempfile.mkdtemp())
class MediaHandlingTest(TestCase):
"""媒体文件处理测试"""
def setUp(self):
self.website = Website.objects.create(
name='媒体测试网站',
base_url='https://media-test.com'
)
def test_media_files_field(self):
"""测试媒体文件字段"""
article = Article.objects.create(
website=self.website,
title='媒体测试文章',
url='https://media-test.com/article/1',
content='<p>测试内容</p>',
media_files=['image1.jpg', 'video1.mp4']
)
# 验证媒体文件列表
self.assertEqual(len(article.media_files), 2)
self.assertIn('image1.jpg', article.media_files)
self.assertIn('video1.mp4', article.media_files)
class ErrorHandlingTest(TestCase):
"""错误处理测试"""
def test_duplicate_url_handling(self):
"""测试重复URL处理"""
website = Website.objects.create(
name='错误测试网站',
base_url='https://error-test.com'
)
# 创建第一篇文章
article1 = Article.objects.create(
website=website,
title='第一篇文章',
url='https://error-test.com/article/1',
content='<p>内容1</p>'
)
# 尝试创建相同URL的文章
with self.assertRaises(Exception):
Article.objects.create(
website=website,
title='第二篇文章',
url='https://error-test.com/article/1', # 相同URL
content='<p>内容2</p>'
)
def test_invalid_website_data(self):
"""测试无效网站数据"""
# 测试重复名称unique约束
Website.objects.create(
name='测试网站1',
base_url='https://test1.com'
)
with self.assertRaises(Exception):
Website.objects.create(
name='测试网站1', # 重复名称
base_url='https://test2.com'
)
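Besides manage.py test, requirements.txt pins pytest, pytest-django and pytest-cov, so the suite can also run under pytest; a minimal pytest.ini sketch follows (the settings module name is inferred from WSGI_APPLICATION in settings.py and is an assumption).

[pytest]
DJANGO_SETTINGS_MODULE = green_classroom.settings
python_files = tests.py test_*.py
addopts = --cov=core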

View File

@@ -1,17 +1,24 @@
from django.urls import path, include
from . import views
# 添加以下导入
from django.contrib import admin
from django.urls import path
from . import views, api
urlpatterns = [
# 原有视图
path('', views.article_list, name='article_list'),
path('article/<int:article_id>/', views.article_detail, name='article_detail'),
path('run-crawler/', views.run_crawler, name='run_crawler'),
# 新增:检查爬虫状态的路由
path('crawler-status/', views.crawler_status, name='crawler_status'),
# 新增:暂停爬虫的路由
path('pause-crawler/', views.pause_crawler, name='pause_crawler'),
# 添加导出文章的路由
path('export-articles/', views.export_articles, name='export_articles'),
# 添加自定义管理后台的路由
# API接口
path('api/health/', api.HealthView.as_view(), name='api_health'),
path('api/websites/', api.WebsitesView.as_view(), name='api_websites'),
path('api/websites/<int:website_id>/', api.api_website_detail, name='api_website_detail'),
path('api/websites/<int:website_id>/crawl/', api.api_crawl_website, name='api_crawl_website'),
path('api/articles/', api.api_articles, name='api_articles'),
path('api/articles/<int:article_id>/', api.api_article_detail, name='api_article_detail'),
path('api/crawler/status/', api.api_crawler_status, name='api_crawler_status'),
path('api/crawler/distributed/', api.api_start_distributed_crawl, name='api_start_distributed_crawl'),
path('api/crawler/batch/<str:batch_id>/', api.api_batch_status, name='api_batch_status'),
path('api/cleanup/', api.api_cleanup_articles, name='api_cleanup_articles'),
path('api/stats/', api.api_stats, name='api_stats'),
# 添加导出文章的URL
path('api/export/', api.export_articles, name='export_articles'),
]
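With rest_framework.authtoken installed and TokenAuthentication enabled in settings.py, these API routes can be exercised roughly as below; the host and token are placeholders, and whether a given endpoint requires authentication depends on the view code, which is not part of this diff.

# Sketch: calling the API routes above with a DRF token (placeholder values).
import requests

BASE = 'http://localhost:8000'
TOKEN = '<drf-token>'        # e.g. created with: python manage.py drf_create_token <username>
headers = {'Authorization': f'Token {TOKEN}'}

print(requests.get(f'{BASE}/api/health/', headers=headers, timeout=10).json())
print(requests.get(f'{BASE}/api/websites/', headers=headers, timeout=10).json())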

View File

@@ -26,8 +26,9 @@ def get_selenium_driver():
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
chrome_options.add_argument(
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
return driver
@@ -35,6 +36,7 @@ def get_selenium_driver():
print(f"创建Selenium WebDriver失败: {e}")
return None
def get_page_with_selenium(url, website_name):
"""使用Selenium获取动态加载的页面内容"""
driver = None
@@ -42,17 +44,17 @@ def get_page_with_selenium(url, website_name):
driver = get_selenium_driver()
if not driver:
return None
print(f"使用Selenium加载页面: {url}")
driver.get(url)
# 等待页面加载完成
wait_time = 10
if "学习强国" in website_name:
wait_time = 15 # 学习强国需要更长时间
elif "法治日报" in website_name:
wait_time = 12 # 法治日报需要较长时间
# 等待页面主要内容加载
try:
WebDriverWait(driver, wait_time).until(
@@ -60,14 +62,14 @@ def get_page_with_selenium(url, website_name):
)
except:
print(f"等待页面加载超时: {url}")
# 额外等待时间确保动态内容加载完成
time.sleep(3)
# 获取页面源码
page_source = driver.page_source
return page_source
except Exception as e:
print(f"Selenium获取页面失败: {url}, 错误: {e}")
return None
@@ -78,6 +80,7 @@ def get_page_with_selenium(url, website_name):
except:
pass
def download_media(url, save_dir):
try:
# 添加请求头以避免403 Forbidden错误
@@ -236,7 +239,7 @@ def process_article(url, website):
need_selenium = False
if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
need_selenium = True
try:
if need_selenium:
# 使用Selenium获取动态加载的内容
@@ -244,28 +247,28 @@ def process_article(url, website):
if not page_source:
print(f"Selenium获取页面失败{url}")
return
# 检查页面内容是否过短
min_length = 100 if "法治日报" in website.name else 300
if len(page_source) < min_length:
print(f"页面内容过短,可能是重定向页面:{url}")
return
# 创建BeautifulSoup对象
soup = BeautifulSoup(page_source, "html.parser")
else:
# 使用requests获取静态内容
resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
resp.raise_for_status()
# 检查是否是重定向页面
if len(resp.text) < 300:
print(f"页面内容过短,可能是重定向页面:{url}")
return
# 创建BeautifulSoup对象
soup = BeautifulSoup(resp.text, "html.parser")
except Exception as e:
print(f"请求失败:{url},错误:{e}")
return
@@ -353,7 +356,7 @@ def process_article(url, website):
heading_text = heading.get_text(strip=True)
if title_text in heading_text or heading_text in title_text:
heading.decompose()
# 移除class包含title的元素
for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
title_element_text = title_element.get_text(strip=True)
@@ -489,13 +492,13 @@ def process_article(url, website):
soup.find("p", class_="title") or
soup.find("title")
)
# 针对求是的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
# 针对求是的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
@@ -522,7 +525,7 @@ def process_article(url, website):
# 如果 strong 在正文前两段内,就删除
if parent_p in content_tag.find_all("p")[:2]:
strong_tag.decompose()
# 移除h1、h2、h3标题元素中的重复标题
for heading in content_tag.find_all(["h1", "h2", "h3"]):
heading_text = heading.get_text(strip=True)
@@ -530,11 +533,12 @@ def process_article(url, website):
# 确保不删除title_tag本身
if heading != title_tag:
heading.decompose()
# 移除class包含title的元素
for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
title_element_text = title_element.get_text(strip=True)
if title_element_text and (title_text in title_element_text or title_element_text in title_text):
if title_element_text and (
title_text in title_element_text or title_element_text in title_text):
# 确保不删除title_tag本身
if title_element != title_tag:
title_element.decompose()
@@ -583,7 +587,7 @@ def process_article(url, website):
soup.find("h2") or # 解放军报使用h2标签作为标题
soup.find("title")
)
# 针对解放军报的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
@@ -606,34 +610,34 @@ def process_article(url, website):
# 移除面包屑导航
for breadcrumb in content_tag.find_all("ol", class_="breadcrumb"):
breadcrumb.decompose()
# 移除分享相关元素
for share_element in content_tag.find_all("div", class_="share-custom"):
share_element.decompose()
# 移除作者信息段落
for author_p in content_tag.find_all("p"):
text = author_p.get_text(strip=True)
if "来源:" in text or "作者:" in text or "责任编辑:" in text or "发布:" in text:
author_p.decompose()
# 移除进度条
for progress in content_tag.find_all("div", class_="progress-bar"):
progress.decompose()
# 移除播放器
for player in content_tag.find_all("div", class_="player"):
player.decompose()
# 移除媒体URL容器
for media in content_tag.find_all("div", id="mediaurl"):
media.decompose()
# 移除新闻列表(但保留其中的内容)
for news_list in content_tag.find_all("ul", id="main-news-list"):
# 不删除整个ul而是unwrap它保留其中的内容
news_list.unwrap()
# 移除编辑信息
for editor_element in content_tag.find_all("div", class_="editor"):
editor_element.decompose()
@@ -744,13 +748,13 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对工人日报的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
# 进一步处理如果h1标题包含太多无关信息尝试从title标签提取更简洁的标题
if title_tag and title_tag.name == 'h1':
title_text = title_tag.get_text(strip=True)
@@ -877,11 +881,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对中国纪检监察报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="content") or
soup.find("div", class_="article-content") or
@@ -955,11 +959,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对中国青年报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="main") or # 中国青年报特有内容容器
soup.find("div", class_="content") or
@@ -977,11 +981,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对中国妇女报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="main") or # 中国妇女报特有内容容器
soup.find("div", class_="news") or # 中国妇女报特有内容容器
@@ -1001,11 +1005,11 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对法治日报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="content-two") or # 优先查找content-two类
soup.find("div", class_="article-content") or # 法治日报特有内容容器
@@ -1058,13 +1062,13 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对农民日报的特殊处理如果标题出现乱码尝试从title标签提取
if title_tag and title_tag.name == 'h1':
title_text = title_tag.get_text(strip=True)
if title_text and any(char in title_text for char in ['', '', '']):
title_tag = soup.find("title")
# 针对农民日报的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
@@ -1078,7 +1082,7 @@ def process_article(url, website):
soup.find("div", class_="article") or
soup.find("div", class_="article-body")
)
# 针对农民日报的特殊处理如果找到多个detailCon选择内容最长的那个
if content_tag and content_tag.get('class') and 'detailCon' in content_tag.get('class', []):
detail_cons = soup.find_all("div", class_="detailCon")
@@ -1116,17 +1120,17 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对学习强国的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
# 针对学习强国的特殊处理如果标题太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if title_text and len(title_text) < 10:
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="content") or
soup.find("div", class_="article-content") or
@@ -1153,17 +1157,17 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对旗帜网的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
# 针对旗帜网的特殊处理,确保标题被正确提取
if not title_tag or not title_tag.get_text(strip=True):
title_tag = soup.find("title")
# 针对旗帜网的特殊处理如果标题太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
@@ -1232,13 +1236,13 @@ def process_article(url, website):
soup.find("h1") or
soup.find("title")
)
# 针对中国网的特殊处理如果标题为空或太短尝试从title标签提取
if title_tag:
title_text = title_tag.get_text(strip=True)
if not title_text or len(title_text) < 5:
title_tag = soup.find("title")
content_tag = (
soup.find("div", class_="article") or # 中国网特有内容容器
soup.find("div", class_="main") or
@@ -1281,7 +1285,7 @@ def process_article(url, website):
# 最终标题处理 - 只有在没有网站特定处理时才使用默认处理
if not title_tag:
title_tag = soup.find("h1") or soup.find("title")
title = title_tag.get_text(strip=True) if title_tag else "无标题"
# 对标题进行额外处理,去除可能的多余空白字符
@@ -1564,7 +1568,7 @@ def full_site_crawler(start_url, website, max_pages=1000):
("/content/" in path) or
(path.startswith("/detail/") and len(path) > 10)
)
# 排除列表页面
if "/index.html" in path or path.endswith("/"):
is_article_page = False

View File

@@ -412,4 +412,4 @@ def export_articles(request):
return HttpResponse('不支持的格式', status=400)
except Exception as e:
return HttpResponse(f'导出失败: {str(e)}', status=500)

View File

@@ -10,7 +10,12 @@ For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.1/ref/settings/
"""
import os
from pathlib import Path
from dotenv import load_dotenv
# 加载环境变量
load_dotenv()
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
@@ -19,12 +24,12 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-_kr!&5j#i!)lo(=u-&5ni+21cwxcq)j-35k!ne20)fyx!u6dnl'
SECRET_KEY = os.getenv('SECRET_KEY', 'django-insecure-_kr!&5j#i!)lo(=u-&5ni+21cwxcq)j-35k!ne20)fyx!u6dnl')
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
DEBUG = os.getenv('DEBUG', 'True').lower() == 'true'
ALLOWED_HOSTS = []
ALLOWED_HOSTS = os.getenv('ALLOWED_HOSTS', 'localhost,127.0.0.1').split(',')
# Application definition
@@ -36,8 +41,15 @@ INSTALLED_APPS = [
'django.contrib.messages',
'django.contrib.staticfiles',
'core',
'django_celery_beat',
'django_celery_results',
'rest_framework',
'rest_framework.authtoken',
]
# 导入Admin扩展
# import core.admin_extended # 暂时注释,避免循环导入
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
@@ -71,12 +83,30 @@ WSGI_APPLICATION = 'green_classroom.wsgi.application'
# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
# 根据环境变量选择数据库
DB_ENGINE = os.getenv('DB_ENGINE', 'django.db.backends.sqlite3')
if DB_ENGINE == 'django.db.backends.postgresql':
DATABASES = {
'default': {
'ENGINE': DB_ENGINE,
'NAME': os.getenv('DB_NAME', 'green_classroom'),
'USER': os.getenv('DB_USER', 'postgres'),
'PASSWORD': os.getenv('DB_PASSWORD', ''),
'HOST': os.getenv('DB_HOST', 'localhost'),
'PORT': os.getenv('DB_PORT', '5432'),
# 注意:'charset' 是 MySQL 的连接选项,psycopg2/PostgreSQL 不支持,这里不额外传 OPTIONS
}
}
else:
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
@@ -110,17 +140,118 @@ USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/
STATIC_URL = 'static/'
STATIC_URL = '/static/'
STATIC_ROOT = os.getenv('STATIC_ROOT', os.path.join(BASE_DIR, 'data', 'static'))
# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
import os
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MEDIA_ROOT = os.path.join(BASE_DIR, 'date', 'media')
# 媒体文件配置
MEDIA_ROOT = os.getenv('MEDIA_ROOT', os.path.join(BASE_DIR, 'data', 'media'))
MEDIA_URL = '/media/'
# Celery配置
CELERY_BROKER_URL = os.getenv('CELERY_BROKER_URL', 'redis://localhost:6379/0')
CELERY_RESULT_BACKEND = os.getenv('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = TIME_ZONE
CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = 30 * 60 # 30分钟
# Redis配置
REDIS_URL = os.getenv('REDIS_URL', 'redis://localhost:6379/0')
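These CELERY_* settings are read by a Celery app instance, conventionally defined in green_classroom/celery.py; that module is not part of this diff, so the standard bootstrap is sketched here only as a reference.

# green_classroom/celery.py - standard Celery/Django bootstrap (sketch; the project
# may already contain an equivalent module that is simply not shown in this commit).
import os
from celery import Celery

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')

app = Celery('green_classroom')
app.config_from_object('django.conf:settings', namespace='CELERY')  # reads the CELERY_* values above
app.autodiscover_tasks()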
# 日志配置
LOGGING = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'verbose': {
'format': '{levelname} {asctime} {module} {process:d} {thread:d} {message}',
'style': '{',
},
'simple': {
'format': '{levelname} {message}',
'style': '{',
},
},
'handlers': {
'file': {
'level': os.getenv('LOG_LEVEL', 'INFO'),
'class': 'logging.FileHandler',
'filename': os.getenv('LOG_FILE', os.path.join(BASE_DIR, 'data', 'logs', 'django.log')),
'formatter': 'verbose',
},
'console': {
'level': os.getenv('LOG_LEVEL', 'INFO'),
'class': 'logging.StreamHandler',
'formatter': 'simple',
},
},
'root': {
'handlers': ['console', 'file'],
'level': os.getenv('LOG_LEVEL', 'INFO'),
},
'loggers': {
'django': {
'handlers': ['console', 'file'],
'level': os.getenv('LOG_LEVEL', 'INFO'),
'propagate': False,
},
'core': {
'handlers': ['console', 'file'],
'level': os.getenv('LOG_LEVEL', 'INFO'),
'propagate': False,
},
},
}
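Note that logging.FileHandler does not create missing directories, so with the default LOG_FILE under data/logs/ the directory must exist before Django loads LOGGING; a small guard placed above the LOGGING dict covers this (sketch):

# Ensure the log directory exists; FileHandler raises FileNotFoundError otherwise.
LOG_FILE = os.getenv('LOG_FILE', os.path.join(BASE_DIR, 'data', 'logs', 'django.log'))
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)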
# 安全设置
if not DEBUG:
SECURE_BROWSER_XSS_FILTER = True
SECURE_CONTENT_TYPE_NOSNIFF = True
X_FRAME_OPTIONS = 'DENY'
SECURE_HSTS_SECONDS = 31536000
SECURE_HSTS_INCLUDE_SUBDOMAINS = True
SECURE_HSTS_PRELOAD = True
# 爬虫设置
CRAWLER_TIMEOUT = int(os.getenv('CRAWLER_TIMEOUT', 30))
CRAWLER_MAX_RETRIES = int(os.getenv('CRAWLER_MAX_RETRIES', 3))
CRAWLER_DELAY = int(os.getenv('CRAWLER_DELAY', 1))
# Selenium设置
SELENIUM_HEADLESS = os.getenv('SELENIUM_HEADLESS', 'True').lower() == 'true'
CHROME_DRIVER_PATH = os.getenv('CHROME_DRIVER_PATH', '/usr/bin/chromedriver')
# Sentry监控可选
SENTRY_DSN = os.getenv('SENTRY_DSN')
if SENTRY_DSN:
import sentry_sdk
from sentry_sdk.integrations.django import DjangoIntegration
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[DjangoIntegration()],
traces_sample_rate=1.0,
send_default_pii=True
)
# Django REST Framework 配置
REST_FRAMEWORK = {
'DEFAULT_RENDERER_CLASSES': [
'rest_framework.renderers.JSONRenderer',
'rest_framework.renderers.BrowsableAPIRenderer',
],
'DEFAULT_PERMISSION_CLASSES': [
'rest_framework.permissions.IsAuthenticated',
],
'DEFAULT_AUTHENTICATION_CLASSES': [
'rest_framework.authentication.SessionAuthentication',
'rest_framework.authentication.TokenAuthentication',
],
}
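For reference, a sample .env covering the environment variables read in this settings module; all values and paths are placeholders, and the file itself is git-ignored.

SECRET_KEY=change-me
DEBUG=False
ALLOWED_HOSTS=localhost,127.0.0.1
DB_ENGINE=django.db.backends.postgresql
DB_NAME=green_classroom
DB_USER=postgres
DB_PASSWORD=change-me
DB_HOST=localhost
DB_PORT=5432
STATIC_ROOT=/srv/green_classroom/data/static
MEDIA_ROOT=/srv/green_classroom/data/media
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/0
REDIS_URL=redis://localhost:6379/0
LOG_LEVEL=INFO
LOG_FILE=/srv/green_classroom/data/logs/django.log
CRAWLER_TIMEOUT=30
CRAWLER_MAX_RETRIES=3
CRAWLER_DELAY=1
SELENIUM_HEADLESS=True
CHROME_DRIVER_PATH=/usr/bin/chromedriver
# SENTRY_DSN=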

View File

@@ -1,31 +1,80 @@
amqp==5.3.1
asgiref==3.9.1
asttokens==3.0.0
attrs==25.3.0
beautifulsoup4==4.13.4
billiard==4.2.1
bs4==0.0.2
celery==5.5.3
certifi==2025.8.3
charset-normalizer==3.4.3
click==8.2.1
click-didyoumean==0.3.1
click-plugins==1.1.1.2
click-repl==0.3.0
coverage==7.10.3
cron-descriptor==1.4.5
decorator==5.2.1
Django==5.1
django-celery-beat==2.8.1
django-db-connection-pool==1.2.6
django-timezone-field==7.1
django_celery_results==2.6.0
djangorestframework==3.16.1
executing==2.2.0
factory_boy==3.3.3
Faker==37.5.3
h11==0.16.0
idna==3.10
iniconfig==2.1.0
ipython==9.4.0
ipython_pygments_lexers==1.1.1
jedi==0.19.2
kombu==5.5.4
lxml==6.0.0
m3u8==6.0.0
matplotlib-inline==0.1.7
outcome==1.3.0.post0
packaging==25.0
parso==0.8.4
pexpect==4.9.0
pluggy==1.6.0
prompt_toolkit==3.0.51
psycopg2-binary==2.9.10
ptyprocess==0.7.0
pure_eval==0.2.3
pycryptodome==3.23.0
Pygments==2.19.2
PySocks==1.7.1
pytest==8.4.1
pytest-cov==6.2.1
pytest-django==4.11.1
python-crontab==3.3.0
python-dateutil==2.9.0.post0
python-docx==1.2.0
python-dotenv==1.1.1
redis==6.4.0
requests==2.32.4
selenium==4.34.2
sentry-sdk==2.35.0
six==1.17.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.7
SQLAlchemy==2.0.43
sqlparams==6.2.0
sqlparse==0.5.3
stack-data==0.6.3
tqdm==4.67.1
traitlets==5.14.3
trio==0.30.0
trio-websocket==0.12.2
typing_extensions==4.14.1
tzdata==2025.2
urllib3==2.5.0
uv==0.8.8
vine==5.1.0
wcwidth==0.2.13
webdriver-manager==4.0.2
websocket-client==1.8.0
wsproto==1.2.0