diff --git a/.gitignore b/.gitignore index dcc299e..9d3c86c 100644 --- a/.gitignore +++ b/.gitignore @@ -180,5 +180,11 @@ cython_debug/ # ##################################### +# 数据目录 +data/ date/media/ +# 配置文件 +config/ +.env + diff --git a/core/admin.py b/core/admin.py deleted file mode 100644 index 3479ae2..0000000 --- a/core/admin.py +++ /dev/null @@ -1,517 +0,0 @@ -from .models import Website, Article -# 添加actions相关的导入 -from django.contrib import messages -# 添加导出功能所需导入 -import csv -from django.http import HttpResponse -import json -# 添加视图函数需要的导入 -from django.shortcuts import render, redirect -from django.urls import path -from django.contrib import admin -from django.core.management import call_command - -# 添加运行爬虫的视图函数 -def run_crawler_view(request): - """ - 管理后台运行爬虫的视图 - """ - if request.method == 'POST': - website_name = request.POST.get('website_name') - if not website_name: - messages.error(request, '请选择要爬取的网站') - return redirect('admin:core_article_changelist') - - try: - # 动态获取网站对象 - website = Website.objects.get(name=website_name) - - # 根据网站对象确定要执行的爬虫命令 - # 移除默认的通用爬虫,每个网站必须配置自己的爬虫命令 - crawler_name = getattr(website, 'crawler_command', None) - - # 如果网站没有配置爬虫命令,则报错 - if not crawler_name: - messages.error(request, f'网站 {website_name} 未配置爬虫命令') - return redirect('admin:core_article_changelist') - - # 运行爬虫命令,传递网站名称 - call_command(crawler_name, website_name) - - messages.success(request, f'成功执行爬虫: {website_name}') - except Website.DoesNotExist: - messages.error(request, f'网站不存在: {website_name}') - except Exception as e: - messages.error(request, f'执行爬虫失败: {str(e)}') - - return redirect('admin:core_article_changelist') - - -@admin.register(Website) -class WebsiteAdmin(admin.ModelAdmin): - list_display = ('name', 'base_url', 'enabled') - - -# 为ArticleAdmin添加自定义动作 -@admin.register(Article) -class ArticleAdmin(admin.ModelAdmin): - list_display = ('title', 'website', 'pub_date') - search_fields = ('title', 'content') - # 添加动作选项 - actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json', - 'export_as_word', 'export_with_media'] - - def get_websites(self): - """获取所有启用的网站""" - return Website.objects.filter(enabled=True) - - # 重写get_urls方法,添加自定义URL - def get_urls(self): - urls = super().get_urls() - custom_urls = [ - path('run-crawler/', self.admin_site.admin_view(run_crawler_view), name='run_crawler'), - ] - return custom_urls + urls - - def export_as_csv(self, request, queryset): - """导出选中的文章为CSV格式""" - meta = self.model._meta - field_names = [field.name for field in meta.fields] - - response = HttpResponse(content_type='text/csv') - response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta) - writer = csv.writer(response) - - writer.writerow(field_names) - for obj in queryset: - row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field) for field in - field_names] - writer.writerow(row) - - return response - - export_as_csv.short_description = "导出选中文章为CSV格式" - - def export_as_json(self, request, queryset): - """导出选中的文章为JSON格式""" - response = HttpResponse(content_type='application/json') - response['Content-Disposition'] = 'attachment; filename=articles.json' - - # 构造要导出的数据 - articles_data = [] - for article in queryset: - articles_data.append({ - 'id': article.id, - 'title': article.title, - 'website': article.website.name, - 'url': article.url, - 'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None, - 'content': article.content, - 'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'), 
- 'media_files': article.media_files - }) - - # 写入JSON数据 - response.write(json.dumps(articles_data, ensure_ascii=False, indent=2)) - return response - - export_as_json.short_description = "导出选中文章为JSON格式" - - def export_as_word(self, request, queryset): - """导出选中的文章为Word格式""" - try: - from docx import Document - from io import BytesIO - from docx.shared import Inches - except ImportError: - self.message_user(request, "缺少python-docx库,请安装: pip install python-docx", messages.ERROR) - return - - # 创建Word文档 - doc = Document() - doc.add_heading('文章导出', 0) - - for article in queryset: - # 添加文章标题 - doc.add_heading(article.title, level=1) - - # 添加文章元数据 - doc.add_paragraph(f"网站: {article.website.name}") - doc.add_paragraph(f"URL: {article.url}") - doc.add_paragraph( - f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}") - doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}") - - # 添加文章内容 - doc.add_heading('内容', level=2) - # 简单处理HTML内容,移除标签并处理图片 - from bs4 import BeautifulSoup - soup = BeautifulSoup(article.content, 'html.parser') - - # 处理内容中的图片 - for img in soup.find_all('img'): - src = img.get('src', '') - if src: - # 尝试添加图片到文档 - try: - import os - from django.conf import settings - import requests - from io import BytesIO - - # 构建完整的图片路径 - if src.startswith('http'): - # 网络图片 - response = requests.get(src, timeout=10) - image_stream = BytesIO(response.content) - doc.add_picture(image_stream, width=Inches(4.0)) - else: - # 本地图片 - full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/')) - if os.path.exists(full_path): - doc.add_picture(full_path, width=Inches(4.0)) - except Exception as e: - # 如果添加图片失败,添加图片URL作为文本 - doc.add_paragraph(f"[图片: {src}]") - - # 移除原始img标签 - img.decompose() - - content_text = soup.get_text() - doc.add_paragraph(content_text) - - # 添加媒体文件信息 - if article.media_files: - doc.add_heading('媒体文件', level=2) - for media_file in article.media_files: - try: - import os - from django.conf import settings - from io import BytesIO - import requests - - full_path = os.path.join(settings.MEDIA_ROOT, media_file) - if os.path.exists(full_path): - # 添加图片到文档 - doc.add_picture(full_path, width=Inches(4.0)) - else: - # 如果是URL格式的媒体文件 - if media_file.startswith('http'): - response = requests.get(media_file, timeout=10) - image_stream = BytesIO(response.content) - doc.add_picture(image_stream, width=Inches(4.0)) - else: - doc.add_paragraph(media_file) - except Exception as e: - doc.add_paragraph(media_file) - - # 添加分页符 - doc.add_page_break() - - # 保存到内存 - buffer = BytesIO() - doc.save(buffer) - buffer.seek(0) - - # 创建HttpResponse - from django.http import HttpResponse - response = HttpResponse(buffer.getvalue(), - content_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document') - response['Content-Disposition'] = 'attachment; filename=articles.docx' - return response - - export_as_word.short_description = "导出选中文章为Word格式" - - def export_with_media(self, request, queryset): - """导出选中的文章及媒体文件为ZIP包""" - try: - from docx import Document - from io import BytesIO - from docx.shared import Inches - import zipfile - except ImportError: - self.message_user(request, "缺少必要库,请安装: pip install python-docx", messages.ERROR) - return - - # 创建内存中的ZIP文件 - zip_buffer = BytesIO() - - with zipfile.ZipFile(zip_buffer, 'w') as zip_file: - for article in queryset: - # 为每篇文章创建单独的文件夹 - article_folder = f"article_{article.id}_{article.title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', 
'_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')}" - - # 创建Word文档 - doc = Document() - doc.add_heading(article.title, 0) - - # 添加文章元数据 - doc.add_paragraph(f"网站: {article.website.name}") - doc.add_paragraph(f"URL: {article.url}") - doc.add_paragraph( - f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}") - doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}") - - # 添加文章内容 - doc.add_heading('内容', level=2) - # 简单处理HTML内容,移除标签并处理图片 - from bs4 import BeautifulSoup - soup = BeautifulSoup(article.content, 'html.parser') - - # 处理内容中的图片 - for img in soup.find_all('img'): - src = img.get('src', '') - if src: - # 尝试添加图片到文档 - try: - import os - from django.conf import settings - import requests - - # 构建完整的图片路径 - if src.startswith('http'): - # 网络图片 - response = requests.get(src, timeout=10) - image_stream = BytesIO(response.content) - doc.add_picture(image_stream, width=Inches(4.0)) - # 将网络文件保存到ZIP - zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(src)), - response.content) - else: - # 本地图片 - full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/')) - if os.path.exists(full_path): - doc.add_picture(full_path, width=Inches(4.0)) - # 添加文件到ZIP包 - zip_file.write(full_path, os.path.join(article_folder, 'media', src.lstrip('/'))) - except Exception as e: - # 如果添加图片失败,添加图片URL作为文本 - doc.add_paragraph(f"[图片: {src}]") - - # 移除原始img标签 - img.decompose() - - content_text = soup.get_text() - doc.add_paragraph(content_text) - - # 添加媒体文件信息并打包媒体文件 - if article.media_files: - doc.add_heading('媒体文件', level=2) - for media_file in article.media_files: - try: - import os - from django.conf import settings - - full_path = os.path.join(settings.MEDIA_ROOT, media_file) - # 检查文件扩展名以确定处理方式 - file_extension = os.path.splitext(media_file)[1].lower() - - # 图片文件处理 - if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']: - if os.path.exists(full_path): - # 添加图片到文档 - doc.add_picture(full_path, width=Inches(4.0)) - # 添加文件到ZIP包 - zip_file.write(full_path, os.path.join(article_folder, 'media', media_file)) - else: - # 如果是URL格式的媒体文件 - if media_file.startswith('http'): - response = requests.get(media_file, timeout=10) - image_stream = BytesIO(response.content) - doc.add_picture(image_stream, width=Inches(4.0)) - # 将网络文件保存到ZIP - zip_file.writestr( - os.path.join(article_folder, 'media', os.path.basename(media_file)), - response.content) - else: - doc.add_paragraph(media_file) - # 视频文件处理 - elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']: - # 视频文件只添加到ZIP包中,不在Word文档中显示 - if os.path.exists(full_path): - # 添加文件到ZIP包 - zip_file.write(full_path, os.path.join(article_folder, 'media', media_file)) - # 在Word文档中添加视频文件信息 - doc.add_paragraph(f"[视频文件: {media_file}]") - else: - # 如果是URL格式的媒体文件 - if media_file.startswith('http'): - # 将网络文件保存到ZIP - response = requests.get(media_file, timeout=10) - zip_file.writestr( - os.path.join(article_folder, 'media', os.path.basename(media_file)), - response.content) - doc.add_paragraph(f"[视频文件: {media_file}]") - else: - doc.add_paragraph(media_file) - # 其他文件类型 - else: - if os.path.exists(full_path): - # 添加文件到ZIP包 - zip_file.write(full_path, os.path.join(article_folder, 'media', media_file)) - doc.add_paragraph(f"[文件: {media_file}]") - else: - # 如果是URL格式的媒体文件 - if media_file.startswith('http'): - response = requests.get(media_file, timeout=10) - zip_file.writestr( - os.path.join(article_folder, 'media', os.path.basename(media_file)), - response.content) - 
doc.add_paragraph(f"[文件: {media_file}]") - else: - doc.add_paragraph(media_file) - except Exception as e: - doc.add_paragraph(media_file) - - # 保存每篇文章的Word文档到ZIP文件中的对应文件夹 - doc_buffer = BytesIO() - doc.save(doc_buffer) - doc_buffer.seek(0) - zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'), - doc_buffer.read()) - - # 创建HttpResponse - zip_buffer.seek(0) - from django.http import HttpResponse - response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip') - response['Content-Disposition'] = 'attachment; filename=articles_export.zip' - return response - - export_with_media.short_description = "导出选中文章及媒体文件(ZIP包)" - - -# 为不同网站创建专门的文章管理类 -class NewsCnArticleAdmin(admin.ModelAdmin): - list_display = ('title', 'pub_date') - search_fields = ('title', 'content') - list_filter = ('pub_date',) - actions = ['export_as_csv', 'export_as_json'] - - def get_queryset(self, request): - qs = super().get_queryset(request) - # 只显示新华网的文章 - return qs.filter(website__name='www.news.cn') - - def export_as_csv(self, request, queryset): - """导出选中的文章为CSV格式""" - meta = self.model._meta - field_names = [field.name for field in meta.fields if field.name != 'content'] # 排除content字段以减小CSV大小 - - response = HttpResponse(content_type='text/csv') - response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv' - writer = csv.writer(response) - - writer.writerow(field_names) - for obj in queryset: - row = [] - for field in field_names: - value = getattr(obj, field) - if callable(value): - value = value() - if field == 'website': - value = value.name - row.append(value) - writer.writerow(row) - - return response - - export_as_csv.short_description = "导出选中文章为CSV格式" - - def export_as_json(self, request, queryset): - """导出选中的文章为JSON格式""" - response = HttpResponse(content_type='application/json') - response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json' - - # 构造要导出的数据 - articles_data = [] - for article in queryset: - articles_data.append({ - 'id': article.id, - 'title': article.title, - 'website': article.website.name, - 'url': article.url, - 'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None, - 'content': article.content, - 'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'), - 'media_files': article.media_files - }) - - # 写入JSON数据 - response.write(json.dumps(articles_data, ensure_ascii=False, indent=2)) - return response - - export_as_json.short_description = "导出选中文章为JSON格式" - - -class DongfangyancaoArticleAdmin(admin.ModelAdmin): - list_display = ('title', 'pub_date') - search_fields = ('title', 'content') - list_filter = ('pub_date',) - # 添加动作选项 - actions = ['delete_selected_articles', 'delete_all_articles', 'export_as_csv', 'export_as_json'] - - def get_queryset(self, request): - qs = super().get_queryset(request) - # 只显示东方烟草报的文章 - return qs.filter(website__name='东方烟草报') - - def delete_all_articles(self, request, queryset): - """删除当前筛选的所有文章(东方烟草报的所有文章)""" - # 删除所有东方烟草报的文章 - deleted_count = self.get_queryset(request).delete()[0] - self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS) - - # 设置动作的显示名称 - delete_all_articles.short_description = "删除所有当前筛选的文章" - - def export_as_csv(self, request, queryset): - """导出选中的文章为CSV格式""" - meta = self.model._meta - field_names = [field.name for field in meta.fields if field.name != 'content'] # 排除content字段以减小CSV大小 - - response = HttpResponse(content_type='text/csv') - response['Content-Disposition'] = 'attachment; 
filename=dongfangyancao_articles.csv' - writer = csv.writer(response) - - writer.writerow(field_names) - for obj in queryset: - row = [] - for field in field_names: - value = getattr(obj, field) - if callable(value): - value = value() - if field == 'website': - value = value.name - row.append(value) - writer.writerow(row) - - return response - - export_as_csv.short_description = "导出选中文章为CSV格式" - - def export_as_json(self, request, queryset): - """导出选中的文章为JSON格式""" - response = HttpResponse(content_type='application/json') - response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json' - - # 构造要导出的数据 - articles_data = [] - for article in queryset: - articles_data.append({ - 'id': article.id, - 'title': article.title, - 'website': article.website.name, - 'url': article.url, - 'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None, - 'content': article.content, - 'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'), - 'media_files': article.media_files - }) - - # 写入JSON数据 - response.write(json.dumps(articles_data, ensure_ascii=False, indent=2)) - return response - - export_as_json.short_description = "导出选中文章为JSON格式" - -# 在各自的管理站点中注册模型 diff --git a/core/admin_extended.py b/core/admin_extended.py new file mode 100644 index 0000000..401bfb2 --- /dev/null +++ b/core/admin_extended.py @@ -0,0 +1,384 @@ +""" +Django Admin扩展 +提供增强的管理界面功能 +""" + +import logging +from datetime import datetime, timedelta +from django.contrib import admin +from django.contrib.admin import SimpleListFilter +from django.contrib.admin.utils import model_format_dict +from django.contrib import messages +from django.http import HttpResponseRedirect +from django.urls import path, reverse +from django.utils.html import format_html +from django.utils import timezone +from django.db.models import Count, Q +from django.core.cache import cache + +from .models import Website, Article +from .tasks import crawl_website, crawl_all_websites, cleanup_old_articles +from .distributed_crawler import distributed_crawler + +logger = logging.getLogger(__name__) + + +class WebsiteStatusFilter(SimpleListFilter): + """网站状态过滤器""" + title = '网站状态' + parameter_name = 'status' + + def lookups(self, request, model_admin): + return ( + ('enabled', '已启用'), + ('disabled', '已禁用'), + ('no_articles', '无文章'), + ('recent_crawl', '最近爬取'), + ) + + def queryset(self, request, queryset): + if self.value() == 'enabled': + return queryset.filter(enabled=True) + elif self.value() == 'disabled': + return queryset.filter(enabled=False) + elif self.value() == 'no_articles': + return queryset.annotate(article_count=Count('article')).filter(article_count=0) + elif self.value() == 'recent_crawl': + week_ago = timezone.now() - timedelta(days=7) + return queryset.filter(last_crawl__gte=week_ago) + return queryset + + +class ArticleDateFilter(SimpleListFilter): + """文章日期过滤器""" + title = '发布时间' + parameter_name = 'date_range' + + def lookups(self, request, model_admin): + return ( + ('today', '今天'), + ('week', '本周'), + ('month', '本月'), + ('quarter', '本季度'), + ) + + def queryset(self, request, queryset): + now = timezone.now() + if self.value() == 'today': + return queryset.filter(created_at__date=now.date()) + elif self.value() == 'week': + week_start = now - timedelta(days=now.weekday()) + return queryset.filter(created_at__gte=week_start.replace(hour=0, minute=0, second=0)) + elif self.value() == 'month': + return queryset.filter(created_at__year=now.year, created_at__month=now.month) + elif self.value() 
== 'quarter': + quarter = (now.month - 1) // 3 + quarter_start_month = quarter * 3 + 1 + return queryset.filter( + created_at__year=now.year, + created_at__month__gte=quarter_start_month, + created_at__month__lt=quarter_start_month + 3 + ) + return queryset + + +class WebsiteAdmin(admin.ModelAdmin): + """网站管理""" + list_display = [ + 'name', 'base_url', 'enabled', 'article_count', + 'last_crawl_display', 'status_indicator', 'actions_column' + ] + list_filter = [WebsiteStatusFilter, 'enabled'] + search_fields = ['name', 'base_url'] + readonly_fields = ['article_count'] + actions = ['enable_websites', 'disable_websites', 'crawl_selected', 'crawl_all'] + + fieldsets = ( + ('基本信息', { + 'fields': ('name', 'base_url', 'enabled') + }), + ('统计信息', { + 'fields': ('article_count',), + 'classes': ('collapse',) + }), + ('时间信息', { + 'fields': (), + 'classes': ('collapse',) + }), + ) + + # 添加get_websites方法以支持模板中的网站选择 + def get_websites(self, request): + """获取所有启用的网站,用于模板中的选择框""" + return Website.objects.filter(enabled=True) + + def article_count(self, obj): + """文章数量""" + return obj.article_set.count() + + article_count.short_description = '文章数量' + + def last_crawl_display(self, obj): + """最后爬取时间显示""" + return '未实现' + + last_crawl_display.short_description = '最后爬取' + + def status_indicator(self, obj): + """状态指示器""" + if obj.enabled: + return format_html('● 正常') + else: + return format_html('● 禁用') + + status_indicator.short_description = '状态' + + def actions_column(self, obj): + """操作列""" + return format_html( + '爬取 ' + '查看文章', + reverse('admin:crawl_website', args=[obj.id]), + reverse('admin:core_article_changelist') + f'?website__id__exact={obj.id}' + ) + + actions_column.short_description = '操作' + + def enable_websites(self, request, queryset): + """启用选中的网站""" + updated = queryset.update(enabled=True) + self.message_user(request, f'成功启用 {updated} 个网站') + + enable_websites.short_description = '启用选中的网站' + + def disable_websites(self, request, queryset): + """禁用选中的网站""" + updated = queryset.update(enabled=False) + self.message_user(request, f'成功禁用 {updated} 个网站') + + disable_websites.short_description = '禁用选中的网站' + + def crawl_selected(self, request, queryset): + """爬取选中的网站""" + for website in queryset: + try: + task = crawl_website.delay(website.id) + self.message_user( + request, + f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})', + messages.SUCCESS + ) + except Exception as e: + error_msg = str(e) + if "[Errno 61] Connection refused" in error_msg: + detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务" + else: + detailed_msg = error_msg + self.message_user( + request, + f'网站 {website.name} 爬取任务启动失败: {detailed_msg}', + messages.ERROR + ) + + crawl_selected.short_description = '爬取选中的网站' + + def crawl_all(self, request, queryset): + try: + task = crawl_all_websites.delay() + self.message_user( + request, + f'批量爬取任务已启动 (任务ID: {task.id})', + messages.SUCCESS + ) + except Exception as e: + error_msg = str(e) + if "[Errno 61] Connection refused" in error_msg: + detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 
在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务" + else: + detailed_msg = error_msg + self.message_user( + request, + f'批量爬取任务启动失败: {detailed_msg}', + messages.ERROR + ) + + # crawl_all.short_description = '爬取所有网站' + + def get_urls(self): + """添加自定义URL""" + urls = super().get_urls() + custom_urls = [ + path( + '/crawl/', + self.admin_site.admin_view(self.crawl_website_view), + name='crawl_website', + ), + path( + 'run-crawler/', + self.admin_site.admin_view(self.run_crawler_view), + name='run_crawler', + ), + ] + return custom_urls + urls + + def crawl_website_view(self, request, website_id): + """爬取单个网站视图""" + try: + website = Website.objects.get(id=website_id) + task = crawl_website.delay(website_id) + self.message_user( + request, + f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})', + messages.SUCCESS + ) + except Website.DoesNotExist: + self.message_user(request, '网站不存在', messages.ERROR) + except Exception as e: + error_msg = str(e) + if "[Errno 61] Connection refused" in error_msg: + detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务" + else: + detailed_msg = error_msg + self.message_user(request, f'爬取任务启动失败: {detailed_msg}', messages.ERROR) + + return HttpResponseRedirect(reverse('admin:core_website_changelist')) + + def run_crawler_view(self, request): + """运行爬虫视图""" + try: + task = crawl_all_websites.delay() + self.message_user( + request, + f'批量爬取任务已启动 (任务ID: {task.id})', + messages.SUCCESS + ) + except Exception as e: + error_msg = str(e) + if "[Errno 61] Connection refused" in error_msg: + detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务" + else: + detailed_msg = error_msg + self.message_user( + request, + f'批量爬取任务启动失败: {detailed_msg}', + messages.ERROR + ) + + return HttpResponseRedirect(reverse('admin:core_website_changelist')) + + +class ArticleAdmin(admin.ModelAdmin): + """文章管理""" + list_display = [ + 'title', 'website', 'created_at', + 'media_count', 'actions_column' + ] + list_filter = [ + ArticleDateFilter, 'website', 'created_at' + ] + search_fields = ['title', 'content', 'url'] + readonly_fields = ['created_at', 'media_files_display'] + date_hierarchy = 'created_at' + + fieldsets = ( + ('基本信息', { + 'fields': ('title', 'url', 'website') + }), + ('内容', { + 'fields': ('content',) + }), + ('媒体文件', { + 'fields': ('media_files_display',), + 'classes': ('collapse',) + }), + ('时间信息', { + 'fields': ('created_at',), + 'classes': ('collapse',) + }), + ) + + def content_preview(self, obj): + """内容预览""" + return obj.content[:100] + '...' 
if len(obj.content) > 100 else obj.content + + content_preview.short_description = '内容预览' + + def media_count(self, obj): + """媒体文件数量""" + if obj.media_files: + return len(obj.media_files) + return 0 + + media_count.short_description = '媒体文件' + + def media_files_display(self, obj): + """媒体文件显示""" + if not obj.media_files: + return '无媒体文件' + + html = '' + for i, media in enumerate(obj.media_files): + if media.get('type') == 'image': + html += f'' + elif media.get('type') == 'video': + html += f'' + html += '' + return format_html(html) + + media_files_display.short_description = '媒体文件' + + def actions_column(self, obj): + """操作列""" + # 修改: 添加跳转到本地文章详情页的链接 + return format_html( + '查看原文 ' + '本地查看', + obj.url, + reverse('article_detail', args=[obj.id]) + ) + + actions_column.short_description = '操作' + + +class CrawlerStatusAdmin(admin.ModelAdmin): + """爬虫状态管理""" + change_list_template = 'admin/crawler_status.html' + + def changelist_view(self, request, extra_context=None): + """爬虫状态视图""" + # 获取分布式爬虫状态 + nodes = distributed_crawler.get_available_nodes() + node_statuses = [] + + for node_id in nodes: + status = distributed_crawler.get_node_status(node_id) + node_statuses.append(status) + + # 获取最近的批次 + batches = distributed_crawler.get_all_batches()[:10] + + # 获取任务统计 + task_stats = { + 'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]), + 'total_nodes': len(nodes), + 'total_batches': len(batches), + } + + extra_context = extra_context or {} + extra_context.update({ + 'nodes': node_statuses, + 'batches': batches, + 'task_stats': task_stats, + }) + + return super().changelist_view(request, extra_context) + + +# 注册管理类 +admin.site.register(Website, WebsiteAdmin) +admin.site.register(Article, ArticleAdmin) + +# 自定义管理站点标题 +admin.site.site_header = 'Green Classroom 管理系统' +admin.site.site_title = 'Green Classroom' +admin.site.index_title = '欢迎使用 Green Classroom 管理系统' diff --git a/core/api.py b/core/api.py new file mode 100644 index 0000000..791dda8 --- /dev/null +++ b/core/api.py @@ -0,0 +1,746 @@ +""" +RESTful API模块 +提供完整的API接口,支持爬虫管理、数据查询、任务控制 +""" + +import logging +from datetime import datetime, timedelta +from typing import Dict, List, Any +import json +import csv +import io +import zipfile + +from django.http import JsonResponse, HttpResponse +from django.views.decorators.csrf import csrf_exempt +from django.views.decorators.http import require_http_methods +from django.core.paginator import Paginator +from django.db.models import Q, Count +from django.utils import timezone +# 添加DRF相关导入 +from rest_framework.views import APIView +from rest_framework.response import Response +from rest_framework.permissions import IsAuthenticated +from rest_framework.authentication import SessionAuthentication, TokenAuthentication + +# 添加python-docx库支持 +from docx import Document + +# 添加BeautifulSoup导入 +from bs4 import BeautifulSoup + +from .models import Website, Article +from .tasks import crawl_website, cleanup_old_articles +from .distributed_crawler import distributed_crawler + +logger = logging.getLogger(__name__) + + +def api_response(data=None, message="", status=200, error=None): + """统一的API响应格式""" + response = { + "success": status < 400, + "message": message, + "timestamp": datetime.now().isoformat(), + } + + if data is not None: + response["data"] = data + + if error: + response["error"] = error + + # 如果是DRF视图,则返回DRF Response + if hasattr(api_response, '_use_drf_response') and api_response._use_drf_response: + return Response(response, status=status) + + return JsonResponse(response, 
status=status) + + +# 修改健康检查接口为DRF类视图 +class HealthView(APIView): + """健康检查接口""" + permission_classes = [] # 允许无认证访问 + authentication_classes = [] + + def get(self, request): + try: + # 检查数据库连接 + website_count = Website.objects.count() + article_count = Article.objects.count() + + # 检查Redis连接 + from django.core.cache import cache + cache.set('health_check', 'ok', 60) + cache_result = cache.get('health_check') + + health_data = { + "status": "healthy", + "database": "ok", + "redis": "ok" if cache_result == 'ok' else 'error', + "website_count": website_count, + "article_count": article_count, + "uptime": "running" + } + + # 设置使用DRF响应 + api_response._use_drf_response = True + return api_response(data=health_data, message="服务运行正常") + + except Exception as e: + logger.error(f"健康检查失败: {e}") + return api_response( + data={"status": "unhealthy", "error": str(e)}, + message="服务异常", + status=500, + error=str(e) + ) + finally: + api_response._use_drf_response = False + + +# 修改网站列表接口为DRF类视图 +class WebsitesView(APIView): + """获取网站列表""" + permission_classes = [IsAuthenticated] + authentication_classes = [SessionAuthentication, TokenAuthentication] + + def get(self, request): + try: + # 分页参数 + page = int(request.GET.get('page', 1)) + page_size = int(request.GET.get('page_size', 20)) + search = request.GET.get('search', '') + enabled = request.GET.get('enabled', '') + + # 构建查询 + queryset = Website.objects.all() + + if search: + queryset = queryset.filter( + Q(name__icontains=search) | + Q(base_url__icontains=search) + ) + + if enabled in ['true', 'false']: + queryset = queryset.filter(enabled=enabled == 'true') + + # 排序 - 使用id字段替代不存在的created_at字段 + queryset = queryset.order_by('-id') + + # 分页 + paginator = Paginator(queryset, page_size) + websites_page = paginator.get_page(page) + + # 统计数据 + stats = { + 'total_websites': Website.objects.count(), + 'enabled_websites': Website.objects.filter(enabled=True).count(), + 'disabled_websites': Website.objects.filter(enabled=False).count(), + } + + # 序列化数据 + websites_data = [] + for website in websites_page: + website_data = { + 'id': website.id, + 'name': website.name, + 'base_url': website.base_url, + 'enabled': website.enabled, + # 移除不存在的created_at和updated_at字段 + 'article_count': website.article_set.count(), + 'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None, + } + websites_data.append(website_data) + + response_data = { + 'websites': websites_data, + 'pagination': { + 'page': page, + 'page_size': page_size, + 'total_pages': paginator.num_pages, + 'total_count': paginator.count, + 'has_next': websites_page.has_next(), + 'has_previous': websites_page.has_previous(), + }, + 'stats': stats + } + + # 设置使用DRF响应 + api_response._use_drf_response = True + return api_response(data=response_data, message="获取网站列表成功") + + except Exception as e: + logger.error(f"获取网站列表失败: {e}") + return api_response(message="获取网站列表失败", status=500, error=str(e)) + finally: + api_response._use_drf_response = False + + +@csrf_exempt +@require_http_methods(["GET"]) +def api_website_detail(request, website_id): + """获取网站详情""" + try: + website = Website.objects.get(id=website_id) + + # 获取最近的文章 + recent_articles = website.article_set.order_by('-created_at')[:10] + + website_data = { + 'id': website.id, + 'name': website.name, + 'base_url': website.base_url, + 'enabled': website.enabled, + 'created_at': website.created_at.isoformat(), + 'updated_at': website.updated_at.isoformat(), + 'last_crawl': website.last_crawl.isoformat() if website.last_crawl else 
None, + 'article_count': website.article_set.count(), + 'recent_articles': [ + { + 'id': article.id, + 'title': article.title, + 'url': article.url, + 'created_at': article.created_at.isoformat(), + } + for article in recent_articles + ] + } + + return api_response(data=website_data, message="获取网站详情成功") + + except Website.DoesNotExist: + return api_response(message="网站不存在", status=404, error="Website not found") + except Exception as e: + logger.error(f"获取网站详情失败: {e}") + return api_response(message="获取网站详情失败", status=500, error=str(e)) + + +@csrf_exempt +@require_http_methods(["POST"]) +def api_crawl_website(request, website_id): + """爬取指定网站""" + try: + website = Website.objects.get(id=website_id) + + # 启动爬虫任务 + task = crawl_website.delay(website_id) + + response_data = { + 'task_id': task.id, + 'website_id': website_id, + 'website_name': website.name, + 'status': 'started' + } + + return api_response(data=response_data, message="爬虫任务已启动") + + except Website.DoesNotExist: + return api_response(message="网站不存在", status=404, error="Website not found") + except Exception as e: + logger.error(f"启动爬虫任务失败: {e}") + return api_response(message="启动爬虫任务失败", status=500, error=str(e)) + + +@csrf_exempt +@require_http_methods(["GET"]) +def api_articles(request): + """获取文章列表""" + try: + # 分页参数 + page = int(request.GET.get('page', 1)) + page_size = int(request.GET.get('page_size', 20)) + search = request.GET.get('search', '') + website_id = request.GET.get('website_id', '') + date_from = request.GET.get('date_from', '') + date_to = request.GET.get('date_to', '') + + # 构建查询 + queryset = Article.objects.select_related('website').all() + + if search: + queryset = queryset.filter( + Q(title__icontains=search) | + Q(content__icontains=search) + ) + + if website_id: + queryset = queryset.filter(website_id=website_id) + + if date_from: + try: + date_from_obj = datetime.fromisoformat(date_from.replace('Z', '+00:00')) + queryset = queryset.filter(created_at__gte=date_from_obj) + except ValueError: + pass + + if date_to: + try: + date_to_obj = datetime.fromisoformat(date_to.replace('Z', '+00:00')) + queryset = queryset.filter(created_at__lte=date_to_obj) + except ValueError: + pass + + # 排序 + queryset = queryset.order_by('-created_at') + + # 分页 + paginator = Paginator(queryset, page_size) + articles_page = paginator.get_page(page) + + # 统计数据 + stats = { + 'total_articles': Article.objects.count(), + 'today_articles': Article.objects.filter( + created_at__date=timezone.now().date() + ).count(), + 'week_articles': Article.objects.filter( + created_at__gte=timezone.now() - timedelta(days=7) + ).count(), + } + + # 序列化数据 + articles_data = [] + for article in articles_page: + article_data = { + 'id': article.id, + 'title': article.title, + 'url': article.url, + 'content': article.content[:200] + '...' 
if len(article.content) > 200 else article.content, + 'created_at': article.created_at.isoformat(), + 'website': { + 'id': article.website.id, + 'name': article.website.name, + }, + 'media_files': article.media_files, + } + articles_data.append(article_data) + + response_data = { + 'articles': articles_data, + 'pagination': { + 'page': page, + 'page_size': page_size, + 'total_pages': paginator.num_pages, + 'total_count': paginator.count, + 'has_next': articles_page.has_next(), + 'has_previous': articles_page.has_previous(), + }, + 'stats': stats + } + + return api_response(data=response_data, message="获取文章列表成功") + + except Exception as e: + logger.error(f"获取文章列表失败: {e}") + return api_response(message="获取文章列表失败", status=500, error=str(e)) + + +@csrf_exempt +@require_http_methods(["GET"]) +def api_article_detail(request, article_id): + """获取文章详情""" + try: + article = Article.objects.select_related('website').get(id=article_id) + + article_data = { + 'id': article.id, + 'title': article.title, + 'url': article.url, + 'content': article.content, + 'created_at': article.created_at.isoformat(), + 'website': { + 'id': article.website.id, + 'name': article.website.name, + 'base_url': article.website.base_url, + }, + 'media_files': article.media_files, + } + + return api_response(data=article_data, message="获取文章详情成功") + + except Article.DoesNotExist: + return api_response(message="文章不存在", status=404, error="Article not found") + except Exception as e: + logger.error(f"获取文章详情失败: {e}") + return api_response(message="获取文章详情失败", status=500, error=str(e)) + + +@csrf_exempt +@require_http_methods(["GET"]) +def api_crawler_status(request): + """获取爬虫状态""" + try: + # 获取分布式爬虫状态 + nodes = distributed_crawler.get_available_nodes() + node_statuses = [] + + for node_id in nodes: + status = distributed_crawler.get_node_status(node_id) + node_statuses.append(status) + + # 获取最近的批次 + batches = distributed_crawler.get_all_batches()[:10] + + # 获取任务统计 + task_stats = { + 'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]), + 'total_nodes': len(nodes), + 'total_batches': len(batches), + } + + response_data = { + 'nodes': node_statuses, + 'batches': batches, + 'stats': task_stats, + } + + return api_response(data=response_data, message="获取爬虫状态成功") + + except Exception as e: + logger.error(f"获取爬虫状态失败: {e}") + return api_response(message="获取爬虫状态失败", status=500, error=str(e)) + + +@csrf_exempt +@require_http_methods(["POST"]) +def api_start_distributed_crawl(request): + """启动分布式爬取""" + try: + data = json.loads(request.body) + website_ids = data.get('website_ids', []) + + if not website_ids: + return api_response(message="请选择要爬取的网站", status=400, error="No websites selected") + + # 启动分布式爬取 + batch_id = distributed_crawler.distribute_crawl_tasks(website_ids) + + if batch_id in ['no_websites', 'no_available_nodes']: + return api_response(message="无法启动分布式爬取", status=400, error=batch_id) + + response_data = { + 'batch_id': batch_id, + 'website_ids': website_ids, + 'status': 'started' + } + + return api_response(data=response_data, message="分布式爬取已启动") + + except json.JSONDecodeError: + return api_response(message="请求数据格式错误", status=400, error="Invalid JSON") + except Exception as e: + logger.error(f"启动分布式爬取失败: {e}") + return api_response(message="启动分布式爬取失败", status=500, error=str(e)) + + +@csrf_exempt +@require_http_methods(["GET"]) +def api_batch_status(request, batch_id): + """获取批次状态""" + try: + batch_status = distributed_crawler.get_batch_status(batch_id) + + if batch_status.get('status') == 'not_found': + 
return api_response(message="批次不存在", status=404, error="Batch not found") + + return api_response(data=batch_status, message="获取批次状态成功") + + except Exception as e: + logger.error(f"获取批次状态失败: {e}") + return api_response(message="获取批次状态失败", status=500, error=str(e)) + + +@csrf_exempt +@require_http_methods(["GET", "POST"]) +def api_cleanup_articles(request): + """清理旧文章""" + # 如果是GET请求,返回清理功能的描述信息 + if request.method == "GET": + response_data = { + 'description': '文章清理API', + 'method': 'POST', + 'parameters': { + 'days': '保留天数,默认30天' + }, + 'example': { + 'days': 30 + } + } + return api_response(data=response_data, message="API使用说明") + + try: + data = json.loads(request.body) + days = data.get('days', 30) + + # 启动清理任务 + task = cleanup_old_articles.delay(days) + + response_data = { + 'task_id': task.id, + 'days': days, + 'status': 'started' + } + + return api_response(data=response_data, message="清理任务已启动") + + except json.JSONDecodeError: + return api_response(message="请求数据格式错误", status=400, error="Invalid JSON") + except Exception as e: + logger.error(f"启动清理任务失败: {e}") + return api_response(message="启动清理任务失败", status=500, error=str(e)) + + +@csrf_exempt +@require_http_methods(["GET"]) +def api_stats(request): + """获取统计信息""" + try: + # 基础统计 + total_websites = Website.objects.count() + total_articles = Article.objects.count() + enabled_websites = Website.objects.filter(enabled=True).count() + + # 时间统计 + today = timezone.now().date() + week_ago = timezone.now() - timedelta(days=7) + month_ago = timezone.now() - timedelta(days=30) + + today_articles = Article.objects.filter(created_at__date=today).count() + week_articles = Article.objects.filter(created_at__gte=week_ago).count() + month_articles = Article.objects.filter(created_at__gte=month_ago).count() + + # 网站统计 + website_stats = [] + for website in Website.objects.all(): + website_stats.append({ + 'id': website.id, + 'name': website.name, + 'article_count': website.article_set.count(), + # 使用getattr安全访问last_crawl属性,如果不存在则返回None + 'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None, + }) + + # 分布式爬虫统计 + nodes = distributed_crawler.get_available_nodes() + batches = distributed_crawler.get_all_batches() + + response_data = { + 'overview': { + 'total_websites': total_websites, + 'enabled_websites': enabled_websites, + 'total_articles': total_articles, + 'today_articles': today_articles, + 'week_articles': week_articles, + 'month_articles': month_articles, + }, + 'websites': website_stats, + 'crawler': { + 'active_nodes': len(nodes), + 'total_batches': len(batches), + 'recent_batches': batches[:5], + } + } + + return api_response(data=response_data, message="获取统计信息成功") + + except Exception as e: + logger.error(f"获取统计信息失败: {e}") + return api_response(message="获取统计信息失败", status=500, error=str(e)) + + +@csrf_exempt +@require_http_methods(["POST"]) +def export_articles(request): + """导出文章""" + try: + data = json.loads(request.body) + article_ids = data.get('article_ids', []) + export_format = data.get('format', 'docx') # 默认改为docx格式 + + if not article_ids: + return api_response(message="请选择要导出的文章", status=400, error="No articles selected") + + # 获取文章数据 + articles = Article.objects.filter(id__in=article_ids).select_related('website') + + if not articles.exists(): + return api_response(message="未找到指定的文章", status=404, error="Articles not found") + + import os # 添加导入 + from django.conf import settings # 添加导入 + + if export_format == 'json': + # 导出为JSON格式 + articles_data = [] + for article in articles: + 
articles_data.append({ + 'id': article.id, + 'title': article.title, + 'url': article.url, + 'content': article.content, + 'created_at': article.created_at.isoformat(), + 'website': { + 'id': article.website.id, + 'name': article.website.name, + }, + 'media_files': article.media_files, + }) + + response = HttpResponse( + json.dumps(articles_data, ensure_ascii=False, indent=2), + content_type='application/json' + ) + response['Content-Disposition'] = 'attachment; filename="articles.json"' + return response + + elif export_format == 'csv': + # 导出为CSV格式 + output = io.StringIO() + writer = csv.writer(output) + writer.writerow(['ID', '标题', '网址', '内容', '创建时间', '网站']) + + for article in articles: + writer.writerow([ + article.id, + article.title, + article.url, + article.content[:1000] + '...' if len(article.content) > 1000 else article.content, + article.created_at.isoformat(), + article.website.name + ]) + + response = HttpResponse(output.getvalue(), content_type='text/csv') + response['Content-Disposition'] = 'attachment; filename="articles.csv"' + return response + + elif export_format == 'docx': + # 导出为Word格式,每个文章一个文件夹 + zip_buffer = io.BytesIO() + with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file: + for article in articles: + # 创建文章文件夹名称 + safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip() + folder_name = f"article_{article.id}_{safe_title}"[:50] + + # 创建Word文档 + doc = Document() + doc.add_heading(article.title, 0) + + # 添加文章信息 + doc.add_paragraph(f"网站: {article.website.name}") + doc.add_paragraph(f"网址: {article.url}") + doc.add_paragraph(f"发布时间: {article.pub_date.isoformat() if article.pub_date else 'N/A'}") + doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}") + + # 添加内容标题 + doc.add_heading('内容:', level=1) + + # 处理HTML内容 + content_text = BeautifulSoup(article.content, 'html.parser').get_text() + doc.add_paragraph(content_text) + + # 将文档保存到内存中 + doc_buffer = io.BytesIO() + doc.save(doc_buffer) + doc_buffer.seek(0) + + # 添加到ZIP文件 + zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue()) + + # 添加媒体文件(如果存在) + if article.media_files: + for media in article.media_files: + try: + # 如果是本地文件路径 + if not media.startswith('http'): + media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/')) + if os.path.exists(media_path): + zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}") + # 如果是URL格式的媒体文件 + else: + import requests + from io import BytesIO + + response = requests.get(media, timeout=10) + if response.status_code == 200: + image_stream = BytesIO(response.content) + media_filename = f"{folder_name}/media/{os.path.basename(media)}" + zip_file.writestr(media_filename, image_stream.getvalue()) + except Exception: + # 忽略无法添加的媒体文件 + pass + + response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip') + response['Content-Disposition'] = 'attachment; filename="articles.zip"' + return response + + elif export_format == 'zip': + # 导出为ZIP包,每个文章一个文件夹 + zip_buffer = io.BytesIO() + with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file: + for article in articles: + # 创建文章文件夹名称 + safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip() + folder_name = f"article_{article.id}_{safe_title}"[:50] + + # 创建Word文档 + doc = Document() + doc.add_heading(article.title, 0) + + # 添加文章信息 + doc.add_paragraph(f"网站: {article.website.name}") + doc.add_paragraph(f"网址: {article.url}") + doc.add_paragraph(f"发布时间: 
{article.pub_date.isoformat() if article.pub_date else 'N/A'}") + doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}") + + # 添加内容标题 + doc.add_heading('内容:', level=1) + + # 处理HTML内容 + content_text = BeautifulSoup(article.content, 'html.parser').get_text() + doc.add_paragraph(content_text) + + # 将文档保存到内存中 + doc_buffer = io.BytesIO() + doc.save(doc_buffer) + doc_buffer.seek(0) + + # 添加到ZIP文件 + zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue()) + + # 添加媒体文件(如果存在) + if article.media_files: + for media in article.media_files: + try: + # 如果是本地文件路径 + if not media.startswith('http'): + media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/')) + if os.path.exists(media_path): + zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}") + # 如果是URL格式的媒体文件 + else: + import requests + from io import BytesIO + + response = requests.get(media, timeout=10) + if response.status_code == 200: + image_stream = BytesIO(response.content) + media_filename = f"{folder_name}/media/{os.path.basename(media)}" + zip_file.writestr(media_filename, image_stream.getvalue()) + except Exception: + # 忽略无法添加的媒体文件 + pass + + response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip') + response['Content-Disposition'] = 'attachment; filename="articles.zip"' + return response + + else: + return api_response(message="不支持的导出格式", status=400, error="Unsupported format") + + except json.JSONDecodeError: + return api_response(message="请求数据格式错误", status=400, error="Invalid JSON") + except Exception as e: + logger.error(f"导出文章失败: {e}") + return api_response(message="导出文章失败", status=500, error=str(e)) \ No newline at end of file diff --git a/core/apps.py b/core/apps.py index 8115ae6..a854f42 100644 --- a/core/apps.py +++ b/core/apps.py @@ -4,3 +4,8 @@ from django.apps import AppConfig class CoreConfig(AppConfig): default_auto_field = 'django.db.models.BigAutoField' name = 'core' + + def ready(self): + """应用启动时执行""" + # 导入Admin扩展 + import core.admin_extended diff --git a/core/management/commands/crawl_all_media.py b/core/management/commands/crawl_all_media.py index 3c37ae7..3b9db2f 100644 --- a/core/management/commands/crawl_all_media.py +++ b/core/management/commands/crawl_all_media.py @@ -9,7 +9,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--media', type=str, help='指定要爬取的媒体,用逗号分隔') parser.add_argument('--platform', type=str, default='all', - help='指定平台类型: all(全部), web(网站), mobile(移动端)') + help='指定平台类型: all(全部), web(网站)') def handle(self, *args, **options): media_list = options['media'] diff --git a/core/management/commands/crawl_cctv.py b/core/management/commands/crawl_cctv.py index 10d1f34..9ba017b 100644 --- a/core/management/commands/crawl_cctv.py +++ b/core/management/commands/crawl_cctv.py @@ -9,7 +9,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['cctv', 'cctvnews', 'mobile', 'all'], + choices=['cctv', 'cctvnews', 'all'], help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), all(全部)') def handle(self, *args, **options): diff --git a/core/management/commands/crawl_china.py b/core/management/commands/crawl_china.py index 4b330ac..0e8b02c 100644 --- a/core/management/commands/crawl_china.py +++ b/core/management/commands/crawl_china.py @@ -3,13 +3,12 @@ from core.models import Website from core.utils import full_site_crawler -# jimmy.fang-20250815: 因URL问题,移除中国网-省份 class Command(BaseCommand): help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站" def 
add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['china', 'province', 'all'], + choices=['china', 'all'], help='选择爬取平台: china(中国网主网), province(中国网一省份), all(全部)') def handle(self, *args, **options): @@ -23,12 +22,7 @@ class Command(BaseCommand): 'start_url': 'http://www.china.com.cn', 'article_selector': 'a' }, - # 'province': { - # 'name': '中国网一省份', - # 'base_url': 'http://www.china.com.cn', - # 'start_url': 'http://www.china.com.cn/province', - # 'article_selector': 'a' - # } + } if platform == 'all': diff --git a/core/management/commands/crawl_chinanews.py b/core/management/commands/crawl_chinanews.py index 90b21d3..9538c71 100644 --- a/core/management/commands/crawl_chinanews.py +++ b/core/management/commands/crawl_chinanews.py @@ -8,7 +8,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['chinanews', 'mobile', 'all'], + choices=['chinanews', 'all'], help='选择爬取平台: chinanews(中国新闻社), all(全部)') def handle(self, *args, **options): diff --git a/core/management/commands/crawl_cngov.py b/core/management/commands/crawl_cngov.py index 4dbcb27..fcfff66 100644 --- a/core/management/commands/crawl_cngov.py +++ b/core/management/commands/crawl_cngov.py @@ -50,4 +50,4 @@ class Command(BaseCommand): full_site_crawler(platform_config['start_url'], website, max_pages=500) self.stdout.write(f"完成爬取: {platform_config['name']}") - self.stdout.write(self.style.SUCCESS("中国政府网所有平台爬取完成")) \ No newline at end of file + self.stdout.write(self.style.SUCCESS("中国政府网所有平台爬取完成")) diff --git a/core/management/commands/crawl_dongfangyancao.py b/core/management/commands/crawl_dongfangyancao.py index b5561a9..df657e7 100644 --- a/core/management/commands/crawl_dongfangyancao.py +++ b/core/management/commands/crawl_dongfangyancao.py @@ -50,4 +50,4 @@ class Command(BaseCommand): full_site_crawler(platform_config['start_url'], website, max_pages=500) self.stdout.write(f"完成爬取: {platform_config['name']}") - self.stdout.write(self.style.SUCCESS("东方烟草报所有平台爬取完成")) \ No newline at end of file + self.stdout.write(self.style.SUCCESS("东方烟草报所有平台爬取完成")) diff --git a/core/management/commands/crawl_fzrb.py b/core/management/commands/crawl_fzrb.py index bc7f067..4328133 100644 --- a/core/management/commands/crawl_fzrb.py +++ b/core/management/commands/crawl_fzrb.py @@ -8,7 +8,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['fzrb', 'mobile', 'all'], + choices=['fzrb', 'all'], help='选择爬取平台: fzrb(法治日报), all(全部)') def handle(self, *args, **options): diff --git a/core/management/commands/crawl_gmrb.py b/core/management/commands/crawl_gmrb.py index 4a84d92..88e25cd 100644 --- a/core/management/commands/crawl_gmrb.py +++ b/core/management/commands/crawl_gmrb.py @@ -2,13 +2,14 @@ from django.core.management.base import BaseCommand from core.models import Website from core.utils import full_site_crawler -# jimmy.fang-20250815: 光明日报反爬,会被阻挡 + +# jimmy.fang-20250815: 取消对光明日报的支持,光明日报反爬,被阻挡 class Command(BaseCommand): help = "全站递归爬取 光明日报及其子网站、客户端、新媒体平台" def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['gmrb', 'mobile', 'all'], + choices=['gmrb', 'all'], help='选择爬取平台: gmrb(光明日报), all(全部)') def handle(self, *args, **options): diff --git a/core/management/commands/crawl_grrb.py b/core/management/commands/crawl_grrb.py index ab44905..9bcb96b 100644 --- 
a/core/management/commands/crawl_grrb.py +++ b/core/management/commands/crawl_grrb.py @@ -8,7 +8,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['grrb', 'mobile', 'all'], + choices=['grrb', 'all'], help='选择爬取平台: grrb(工人日报), all(全部)') def handle(self, *args, **options): diff --git a/core/management/commands/crawl_jjrb.py b/core/management/commands/crawl_jjrb.py index b5fa168..be11065 100644 --- a/core/management/commands/crawl_jjrb.py +++ b/core/management/commands/crawl_jjrb.py @@ -8,7 +8,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['jjrb', 'mobile', 'all'], + choices=['jjrb', 'all'], help='选择爬取平台: jjrb(经济日报), all(全部)') def handle(self, *args, **options): diff --git a/core/management/commands/crawl_kjrb.py b/core/management/commands/crawl_kjrb.py index f030fbd..8cc60a0 100644 --- a/core/management/commands/crawl_kjrb.py +++ b/core/management/commands/crawl_kjrb.py @@ -9,7 +9,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['kjrb', 'mobile', 'all'], + choices=['kjrb', 'all'], help='选择爬取平台: kjrb(科技日报), all(全部)') def handle(self, *args, **options): diff --git a/core/management/commands/crawl_nmrb.py b/core/management/commands/crawl_nmrb.py index f62d620..db611e8 100644 --- a/core/management/commands/crawl_nmrb.py +++ b/core/management/commands/crawl_nmrb.py @@ -8,7 +8,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['nmrb', 'mobile', 'all'], + choices=['nmrb', 'all'], help='选择爬取平台: nmrb(农民日报), all(全部)') def handle(self, *args, **options): diff --git a/core/management/commands/crawl_pla.py b/core/management/commands/crawl_pla.py index 8041cdc..caeb9e6 100644 --- a/core/management/commands/crawl_pla.py +++ b/core/management/commands/crawl_pla.py @@ -8,8 +8,8 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['pla', 'mobile', 'all'], - help='选择爬取平台: pla(解放军报), mobile(移动端), all(全部)') + choices=['pla', 'all'], + help='选择爬取平台: pla(解放军报), all(全部)') def handle(self, *args, **options): platform = options['platform'] diff --git a/core/management/commands/crawl_rmzxb.py b/core/management/commands/crawl_rmzxb.py index 9f49fdc..53244aa 100644 --- a/core/management/commands/crawl_rmzxb.py +++ b/core/management/commands/crawl_rmzxb.py @@ -8,8 +8,8 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['rmzxb', 'mobile', 'all'], - help='选择爬取平台: rmzxb(人民政协网), mobile(移动端), all(全部)') + choices=['rmzxb', 'all'], + help='选择爬取平台: rmzxb(人民政协网), all(全部)') def handle(self, *args, **options): platform = options['platform'] diff --git a/core/management/commands/crawl_xinhua.py b/core/management/commands/crawl_xinhua.py index 729ade0..6bb48a9 100644 --- a/core/management/commands/crawl_xinhua.py +++ b/core/management/commands/crawl_xinhua.py @@ -8,8 +8,8 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['news', 'xinhuanet', 'mobile', 'all'], - help='选择爬取平台: news(新华网), xinhuanet(新华网主站), mobile(移动端), all(全部)') + choices=['news', 'all'], + help='选择爬取平台: news(新华网), all(全部)') def handle(self, *args, **options): platform = 
options['platform'] @@ -22,18 +22,7 @@ class Command(BaseCommand): 'start_url': 'https://www.news.cn', 'article_selector': 'a' }, - 'xinhuanet': { - 'name': '新华网主站', - 'base_url': 'https://www.xinhuanet.com', - 'start_url': 'https://www.xinhuanet.com', - 'article_selector': 'a' - }, - 'mobile': { - 'name': '新华社移动端', - 'base_url': 'https://m.xinhuanet.com', - 'start_url': 'https://m.xinhuanet.com', - 'article_selector': 'a' - } + } if platform == 'all': diff --git a/core/management/commands/crawl_xuexi.py b/core/management/commands/crawl_xuexi.py index 5486913..a8a9212 100644 --- a/core/management/commands/crawl_xuexi.py +++ b/core/management/commands/crawl_xuexi.py @@ -8,8 +8,8 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['xuexi', 'central', 'provincial', 'all'], - help='选择爬取平台: xuexi(学习强国主站), central(中央媒体), provincial(省级平台), all(全部)') + choices=['xuexi', 'all'], + help='选择爬取平台: xuexi(学习强国主站), all(全部)') def handle(self, *args, **options): platform = options['platform'] @@ -22,18 +22,6 @@ class Command(BaseCommand): 'start_url': 'https://www.xuexi.cn', 'article_selector': 'a' }, - 'central': { - 'name': '学习强国中央媒体', - 'base_url': 'https://www.xuexi.cn', - 'start_url': 'https://www.xuexi.cn/central', - 'article_selector': 'a' - }, - 'provincial': { - 'name': '学习强国省级平台', - 'base_url': 'https://www.xuexi.cn', - 'start_url': 'https://www.xuexi.cn/provincial', - 'article_selector': 'a' - } } if platform == 'all': diff --git a/core/management/commands/crawl_xxsb.py b/core/management/commands/crawl_xxsb.py index c73ca4e..c36aaa1 100644 --- a/core/management/commands/crawl_xxsb.py +++ b/core/management/commands/crawl_xxsb.py @@ -8,8 +8,8 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['xxsb', 'mobile', 'all'], - help='选择爬取平台: xxsb(学习时报), mobile(移动端), all(全部)') + choices=['xxsb', 'all'], + help='选择爬取平台: xxsb(学习时报),all(全部)') def handle(self, *args, **options): platform = options['platform'] @@ -22,12 +22,6 @@ class Command(BaseCommand): 'start_url': 'http://www.studytimes.cn', 'article_selector': 'a' }, - 'mobile': { - 'name': '学习时报移动端', - 'base_url': 'http://m.studytimes.cn', - 'start_url': 'http://m.studytimes.cn', - 'article_selector': 'a' - } } if platform == 'all': diff --git a/core/management/commands/crawl_zgfnb.py b/core/management/commands/crawl_zgfnb.py index f33bb9f..4bd3624 100644 --- a/core/management/commands/crawl_zgfnb.py +++ b/core/management/commands/crawl_zgfnb.py @@ -8,8 +8,8 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['zgfnb', 'mobile', 'all'], - help='选择爬取平台: zgfnb(中国妇女报), mobile(移动端), all(全部)') + choices=['zgfnb', 'all'], + help='选择爬取平台: zgfnb(中国妇女报), all(全部)') def handle(self, *args, **options): platform = options['platform'] @@ -22,12 +22,7 @@ class Command(BaseCommand): 'start_url': 'http://www.cnwomen.com.cn', 'article_selector': 'a' }, - 'mobile': { - 'name': '中国妇女报移动端', - 'base_url': 'http://m.cnwomen.com.cn', - 'start_url': 'http://m.cnwomen.com.cn', - 'article_selector': 'a' - } + } if platform == 'all': diff --git a/core/management/commands/crawl_zgjwjc.py b/core/management/commands/crawl_zgjwjc.py index 7c509b2..21c35bd 100644 --- a/core/management/commands/crawl_zgjwjc.py +++ b/core/management/commands/crawl_zgjwjc.py @@ -8,8 +8,8 @@ class Command(BaseCommand): def add_arguments(self, parser): 
parser.add_argument('--platform', type=str, default='all', - choices=['zgjwjc', 'mobile', 'all'], - help='选择爬取平台: zgjwjc(中国纪检监察报), mobile(移动端), all(全部)') + choices=['zgjwjc', 'all'], + help='选择爬取平台: zgjwjc(中国纪检监察报),all(全部)') def handle(self, *args, **options): platform = options['platform'] @@ -18,16 +18,10 @@ class Command(BaseCommand): platforms = { 'zgjwjc': { 'name': '中国纪检监察报', - 'base_url': 'http://www.jjjcb.cn', - 'start_url': 'http://www.jjjcb.cn', + 'base_url': 'https://jjjcb.ccdi.gov.cn', + 'start_url': 'https://jjjcb.ccdi.gov.cn', 'article_selector': 'a' }, - 'mobile': { - 'name': '中国纪检监察报移动端', - 'base_url': 'http://m.jjjcb.cn', - 'start_url': 'http://m.jjjcb.cn', - 'article_selector': 'a' - } } if platform == 'all': diff --git a/core/management/commands/crawl_zgqnb.py b/core/management/commands/crawl_zgqnb.py index fcc2081..e617141 100644 --- a/core/management/commands/crawl_zgqnb.py +++ b/core/management/commands/crawl_zgqnb.py @@ -8,8 +8,8 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--platform', type=str, default='all', - choices=['zgqnb', 'mobile', 'all'], - help='选择爬取平台: zgqnb(中国青年报), mobile(移动端), all(全部)') + choices=['zgqnb', 'all'], + help='选择爬取平台: zgqnb(中国青年报), all(全部)') def handle(self, *args, **options): platform = options['platform'] @@ -22,12 +22,7 @@ class Command(BaseCommand): 'start_url': 'https://www.cyol.com', 'article_selector': 'a' }, - 'mobile': { - 'name': '中国青年报移动端', - 'base_url': 'https://m.cyol.com', - 'start_url': 'https://m.cyol.com', - 'article_selector': 'a' - } + } if platform == 'all': diff --git a/core/management/commands/export_articles.py b/core/management/commands/export_articles.py index bf683d0..dbe1db6 100644 --- a/core/management/commands/export_articles.py +++ b/core/management/commands/export_articles.py @@ -6,6 +6,10 @@ import os from django.conf import settings import zipfile from django.utils import timezone +from bs4 import BeautifulSoup +# 添加python-docx库支持 +import io +from docx import Document class Command(BaseCommand): @@ -119,201 +123,100 @@ class Command(BaseCommand): # 添加Word格式导出方法 def export_as_word(self, articles_data, output_path): try: - from docx import Document - from docx.shared import Inches - except ImportError: - self.stdout.write(self.style.ERROR('缺少python-docx库,请安装: pip install python-docx')) - return - - # 创建Word文档 - doc = Document() - doc.add_heading('文章导出', 0) - - for article_data in articles_data: - # 添加文章标题 - doc.add_heading(article_data['title'], level=1) - - # 添加文章元数据 - doc.add_paragraph(f"网站: {article_data['website']}") - doc.add_paragraph(f"URL: {article_data['url']}") - doc.add_paragraph(f"发布时间: {article_data['pub_date']}") - doc.add_paragraph(f"创建时间: {article_data['created_at']}") - - # 添加文章内容 - doc.add_heading('内容', level=2) - # 简单处理HTML内容,移除标签 - from bs4 import BeautifulSoup - soup = BeautifulSoup(article_data['content'], 'html.parser') - - # 处理内容中的图片 - for img in soup.find_all('img'): - src = img.get('src', '') - if src: - # 尝试添加图片到文档 - try: - import os - from django.conf import settings - import requests - from io import BytesIO - - # 构建完整的图片路径 - if src.startswith('http'): - # 网络图片 - response = requests.get(src, timeout=10) - image_stream = BytesIO(response.content) - doc.add_picture(image_stream, width=Inches(4.0)) - else: - # 本地图片 - full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/')) - if os.path.exists(full_path): - doc.add_picture(full_path, width=Inches(4.0)) - except Exception as e: - # 如果添加图片失败,添加图片URL作为文本 - doc.add_paragraph(f"[图片: {src}]") - - 
# 移除原始img标签 - img.decompose() - - content_text = soup.get_text() - doc.add_paragraph(content_text) - - # 添加媒体文件信息 - if article_data['media_files']: - doc.add_heading('媒体文件', level=2) - for media_file in article_data['media_files']: - try: - import os - from django.conf import settings - from io import BytesIO - import requests - - full_path = os.path.join(settings.MEDIA_ROOT, media_file) - if os.path.exists(full_path): - # 添加图片到文档 - doc.add_picture(full_path, width=Inches(4.0)) - else: - # 如果是URL格式的媒体文件 - if media_file.startswith('http'): - response = requests.get(media_file, timeout=10) - image_stream = BytesIO(response.content) - doc.add_picture(image_stream, width=Inches(4.0)) - else: - doc.add_paragraph(media_file) - except Exception as e: - doc.add_paragraph(media_file) - - # 添加分页符 - doc.add_page_break() - - # 保存文档 - doc.save(output_path) + # 创建一个新的Word文档 + document = Document() + document.add_heading('文章导出', 0) + + for article_data in articles_data: + # 添加文章标题 + document.add_heading(article_data['title'], level=1) + + # 添加文章信息 + document.add_paragraph(f"网站: {article_data['website']}") + document.add_paragraph(f"URL: {article_data['url']}") + document.add_paragraph(f"发布时间: {article_data['pub_date']}") + document.add_paragraph(f"创建时间: {article_data['created_at']}") + + # 添加内容标题 + document.add_heading('内容:', level=2) + + # 处理HTML内容,移除标签 + soup = BeautifulSoup(article_data['content'], 'html.parser') + content_text = soup.get_text() + document.add_paragraph(content_text) + + # 添加分页符分隔文章 + document.add_page_break() + + # 保存文档 + document.save(output_path) + self.stdout.write(self.style.SUCCESS(f'成功导出为Word格式: {output_path}')) + except Exception as e: + self.stdout.write(self.style.ERROR(f'导出Word格式失败: {e}')) def export_with_media(self, articles_data, media_files, output_path, format_type): # 创建ZIP文件 with zipfile.ZipFile(output_path, 'w') as zipf: - # 添加文章数据文件 - data_filename = f'articles.{format_type}' - if format_type == 'json': - json_data = json.dumps(articles_data, ensure_ascii=False, indent=2) - zipf.writestr(data_filename, json_data) - elif format_type == 'csv': - # 创建CSV内容 - if articles_data: - import io - csv_buffer = io.StringIO() - fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files'] - writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames) - writer.writeheader() - for article_data in articles_data: - article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[ - 'media_files'] else '' - writer.writerow(article_data) - zipf.writestr(data_filename, csv_buffer.getvalue()) - elif format_type == 'docx': - # 创建Word文档并保存到ZIP - try: - from docx import Document - from docx.shared import Inches - from io import BytesIO - - doc = Document() - doc.add_heading('文章导出', 0) - - for article_data in articles_data: - doc.add_heading(article_data['title'], level=1) + # 为每篇文章创建独立的文件夹 + for article_data in articles_data: + article_folder = f"article_{article_data['id']}_{article_data['title']}" + # 限制文件夹名称长度并移除非法字符 + article_folder = article_folder[:50].rstrip() + article_folder = "".join(c for c in article_folder if c.isalnum() or c in (' ','_','-')).rstrip() + + # 添加文章数据文件 + if format_type == 'docx': + # 创建Word文档并保存到ZIP + data_filename = f'{article_folder}/article.docx' + try: + # 创建文章信息Word文档 + doc = Document() + doc.add_heading(article_data['title'], 0) + + # 添加文章信息 doc.add_paragraph(f"网站: {article_data['website']}") doc.add_paragraph(f"URL: {article_data['url']}") doc.add_paragraph(f"发布时间: {article_data['pub_date']}") 
doc.add_paragraph(f"创建时间: {article_data['created_at']}") - - doc.add_heading('内容', level=2) - from bs4 import BeautifulSoup + + # 添加内容标题 + doc.add_heading('内容:', level=1) + + # 处理HTML内容 soup = BeautifulSoup(article_data['content'], 'html.parser') - - # 处理内容中的图片 - for img in soup.find_all('img'): - src = img.get('src', '') - if src: - # 尝试添加图片到文档 - try: - import os - from django.conf import settings - import requests - - # 构建完整的图片路径 - if src.startswith('http'): - # 网络图片 - response = requests.get(src, timeout=10) - image_stream = BytesIO(response.content) - doc.add_picture(image_stream, width=Inches(4.0)) - else: - # 本地图片 - full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/')) - if os.path.exists(full_path): - doc.add_picture(full_path, width=Inches(4.0)) - except Exception as e: - # 如果添加图片失败,添加图片URL作为文本 - doc.add_paragraph(f"[图片: {src}]") - - # 移除原始img标签 - img.decompose() - content_text = soup.get_text() doc.add_paragraph(content_text) - - if article_data['media_files']: - doc.add_heading('媒体文件', level=2) - for media_file in article_data['media_files']: - try: - import os - from django.conf import settings - - full_path = os.path.join(settings.MEDIA_ROOT, media_file) - if os.path.exists(full_path): - # 添加图片到文档 - doc.add_picture(full_path, width=Inches(4.0)) - else: - # 如果是URL格式的媒体文件 - if media_file.startswith('http'): - response = requests.get(media_file, timeout=10) - image_stream = BytesIO(response.content) - doc.add_picture(image_stream, width=Inches(4.0)) - else: - doc.add_paragraph(media_file) - except Exception as e: - doc.add_paragraph(media_file) - - doc.add_page_break() - - # 将文档保存到内存中再写入ZIP - doc_buffer = BytesIO() - doc.save(doc_buffer) - doc_buffer.seek(0) - zipf.writestr(data_filename, doc_buffer.read()) - except ImportError: - zipf.writestr(data_filename, "错误:缺少python-docx库,无法生成Word文档") - - # 添加媒体文件 - for media_path in media_files: - arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT)) - zipf.write(media_path, arcname) + + # 将文档保存到内存中 + doc_buffer = io.BytesIO() + doc.save(doc_buffer) + doc_buffer.seek(0) + + # 将文档添加到ZIP文件 + zipf.writestr(data_filename, doc_buffer.getvalue()) + except Exception as e: + error_msg = f"错误:无法生成文章Word文档 - {str(e)}" + zipf.writestr(data_filename, error_msg) + + # 添加媒体文件到文章的media子文件夹 + if article_data['media_files']: + for media_file in article_data['media_files']: + try: + full_path = os.path.join(settings.MEDIA_ROOT, media_file) + if os.path.exists(full_path): + # 添加媒体文件到ZIP中的media子文件夹 + media_filename = f"{article_folder}/media/{os.path.basename(media_file)}" + zipf.write(full_path, media_filename) + else: + # 如果是URL格式的媒体文件 + if media_file.startswith('http'): + import requests + from io import BytesIO + + response = requests.get(media_file, timeout=10) + image_stream = BytesIO(response.content) + media_filename = f"{article_folder}/media/{os.path.basename(media_file)}" + zipf.writestr(media_filename, image_stream.getvalue()) + except Exception as e: + # 错误处理,跳过无法添加的文件 + pass \ No newline at end of file diff --git a/core/templates/admin/core/article/change_list.html b/core/templates/admin/core/article/change_list.html index 7fe0384..e0ae9e7 100644 --- a/core/templates/admin/core/article/change_list.html +++ b/core/templates/admin/core/article/change_list.html @@ -3,6 +3,7 @@ {% block object-tools %} {{ block.super }} + {% endblock %} \ No newline at end of file diff --git a/core/templates/admin/crawler_status.html b/core/templates/admin/crawler_status.html new file mode 100644 index 0000000..088e619 --- 
/dev/null +++ b/core/templates/admin/crawler_status.html @@ -0,0 +1,304 @@ +{% extends "admin/base_site.html" %} +{% load static %} + +{% block title %}爬虫状态 - {{ site_title|default:_('Django site admin') }}{% endblock %} + +{% block extrastyle %} + +{% endblock %} + +{% block content %} + + + 爬虫状态监控 + 刷新 + + + + + + {{ task_stats.total_nodes }} + 活跃节点 + + + {{ task_stats.active_tasks }} + 运行中任务 + + + {{ task_stats.total_batches }} + 总批次 + + + {{ nodes|length }} + 在线节点 + + + + + + 爬虫节点状态 + {% if nodes %} + {% for node in nodes %} + + + {{ node.node_id }} + {{ node.status }} + + + + 活跃任务: + {{ node.active_tasks }} + + + 完成任务: + {{ node.completed_tasks }} + + + 失败任务: + {{ node.failed_tasks }} + + + 最后心跳: + + {% if node.last_heartbeat %} + {{ node.last_heartbeat|date:"H:i:s" }} + {% else %} + 未知 + {% endif %} + + + + + {% endfor %} + {% else %} + + 暂无活跃的爬虫节点 + + {% endif %} + + + + + 最近批次 + {% if batches %} + {% for batch in batches %} + + + {{ batch.batch_id }} + + {% if batch.status == 'running' %} + 运行中 + {% elif batch.status == 'completed' %} + 已完成 + {% elif batch.status == 'failed' %} + 失败 + {% else %} + {{ batch.status }} + {% endif %} + + + + + 总任务: + {{ batch.total_tasks }} + + + 已完成: + {{ batch.completed_tasks }} + + + 失败: + {{ batch.failed_tasks }} + + + 进度: + {{ batch.progress|floatformat:1 }}% + + + {% if batch.status == 'running' %} + + + + {% endif %} + + {% endfor %} + {% else %} + + 暂无批次记录 + + {% endif %} + + + + +{% endblock %} diff --git a/core/templates/core/article_detail.html b/core/templates/core/article_detail.html index e101736..24a7cef 100644 --- a/core/templates/core/article_detail.html +++ b/core/templates/core/article_detail.html @@ -40,7 +40,16 @@ margin-top: 20px; } - .content img { + /* 优化:确保图片和视频不会超出容器显示 */ + .content img, .content video { + max-width: 100%; + height: auto; + display: block; + margin: 10px 0; + } + + /* 优化:确保iframe也不会超出容器显示 */ + .content iframe { max-width: 100%; height: auto; } @@ -61,7 +70,7 @@ body { padding: 10px; } - + .container { padding: 15px; } @@ -69,21 +78,21 @@ - - « 返回文章列表 + + « 返回文章列表 - {{ article.title }} + {{ article.title }} - - 网站: {{ article.website.name }} | - 发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} | - 创建时间: {{ article.created_at|date:"Y-m-d H:i" }} | - 源网址: {{ article.url }} - - - - {{ article.content|safe }} - + + 网站: {{ article.website.name }} | + 发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} | + 创建时间: {{ article.created_at|date:"Y-m-d H:i" }} | + 源网址: {{ article.url }} + + + {{ article.content|safe }} + +
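With this change every crawl_* command accepts only the paper's own key plus 'all'; the 'mobile' and secondary-site choices are gone together with their platform config blocks. Below is a minimal sketch of driving the simplified commands programmatically with Django's standard call_command API; the platform values shown are the ones kept in the diff.

```python
# A minimal sketch; assumes Django is already set up for this project
# (e.g. run from `python manage.py shell`).
from django.core.management import call_command

# Xinhua: 'xinhuanet' and 'mobile' are no longer accepted; use 'news' or 'all'.
call_command('crawl_xinhua', platform='news')

# The default is still 'all', so the option can simply be omitted.
call_command('crawl_zgjwjc')

# On the command line, e.g. `python manage.py crawl_xxsb --platform mobile`,
# argparse now rejects the removed choices with an "invalid choice" error.
```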
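The rewritten export_as_word saves the document straight to output_path, while export_with_media builds each per-article document in an io.BytesIO buffer and passes the bytes to zipf.writestr. Both paths rely on the same python-docx behaviour: Document.save() accepts either a filesystem path or a file-like object. A minimal illustration, with placeholder file names:

```python
import io
from docx import Document

doc = Document()
doc.add_heading('文章导出', 0)
doc.add_paragraph('示例段落')

# export_as_word style: save directly to a path ('articles.docx' is a placeholder).
doc.save('articles.docx')

# export_with_media style: save into an in-memory buffer so the bytes can be
# written into a ZIP archive without touching the filesystem.
buf = io.BytesIO()
doc.save(buf)
buf.seek(0)
docx_bytes = buf.getvalue()  # suitable for zipfile.ZipFile.writestr()
```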
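export_with_media derives each folder name from "article_{id}_{title}", truncates it to 50 characters, and keeps only alphanumerics, spaces, underscores and hyphens. Because str.isalnum() is True for CJK ideographs, Chinese titles pass through intact; only punctuation and path separators are stripped. A quick check of that rule with a made-up title:

```python
# Hypothetical title, used only to illustrate the sanitisation rule.
raw = "article_42_中国妇女报头条 / Top Story!"

folder = raw[:50].rstrip()
folder = "".join(c for c in folder if c.isalnum() or c in (' ', '_', '-')).rstrip()

print(folder)  # article_42_中国妇女报头条  Top Story
```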
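The ZIP produced with media now contains one folder per article, holding article.docx plus a media/ subfolder, instead of a single flat data file and a shared media/ tree. A short way to inspect the resulting layout; the archive name stands in for whatever output_path was passed to export_with_media with format_type='docx', and the entry names are illustrative:

```python
import zipfile

# 'articles_export.zip' is a placeholder for the real output_path.
with zipfile.ZipFile('articles_export.zip') as zf:
    for name in zf.namelist():
        print(name)

# Expected shape, one block per exported article (names are placeholders):
#   article_12_Some Title/article.docx
#   article_12_Some Title/media/cover.jpg
#   article_13_Another Title/article.docx
#   ...
```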