[garbled diff hunk: the HTML markup of this template fragment was stripped in extraction and cannot be recovered. What survives shows that both the removed and the added versions render the paginated article list with the same Django template tags — {% for article in page_obj %} … {{ article.title }} … {% empty %} 暂无文章 ("No articles yet") {% endfor %} — wrapped in different surrounding markup.]
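For orientation, that fragment expects the rendering view to supply a `page_obj`. A minimal sketch of such a view, assuming `core.models.Article` is ordered by publication date — the view name, template path, and page size are illustrative, not taken from the patch:

```python
# Hypothetical list view backing the paginated template fragment above.
from django.core.paginator import Paginator
from django.shortcuts import render

from core.models import Article


def article_list(request):
    articles = Article.objects.order_by('-pub_date')
    paginator = Paginator(articles, 20)  # 20 articles per page (illustrative)
    # get_page() clamps invalid/out-of-range page numbers instead of raising
    page_obj = paginator.get_page(request.GET.get('page'))
    return render(request, 'core/article_list.html', {'page_obj': page_obj})
```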
diff --git a/core/admin.py b/core/admin.py
index bfdd200..b44de9e 100644
--- a/core/admin.py
+++ b/core/admin.py
@@ -8,24 +8,53 @@ from django.http import HttpResponseRedirect
 import csv
 from django.http import HttpResponse
 import json
+# Imports needed by the admin view functions below
+from django.shortcuts import render, redirect
+from django.urls import path
+from django.contrib import admin
+from django.http import JsonResponse
+from django.views.decorators.http import require_http_methods
+from django.core.management import call_command
+import threading
+import uuid
+from django.utils import timezone
 
 # Create custom admin sites
-class NewsCnAdminSite(AdminSite):
-    site_header = "新华网管理后台"
-    site_title = "新华网管理"
-    index_title = "新华网内容管理"
-
-
-class DongfangyancaoAdminSite(AdminSite):
-    site_header = "东方烟草报管理后台"
-    site_title = "东方烟草报管理"
-    index_title = "东方烟草报内容管理"
-
 # Instantiate the admin sites
-news_cn_admin = NewsCnAdminSite(name='news_cn_admin')
-dongfangyancao_admin = DongfangyancaoAdminSite(name='dongfangyancao_admin')
+
+# View that runs a crawler from the admin site
+def run_crawler_view(request):
+    """
+    Admin view that triggers a crawler run.
+    """
+    if request.method == 'POST':
+        website_name = request.POST.get('website_name')
+        if not website_name:
+            messages.error(request, '请选择要爬取的网站')
+            return redirect('admin:core_article_changelist')
+
+        try:
+            # Map the submitted website name to the management command to run
+            if website_name == 'crawl_xinhua':
+                crawler_name = 'crawl_xinhua'
+            elif website_name == 'crawl_dongfangyancao':
+                crawler_name = 'crawl_dongfangyancao'
+            elif website_name == 'crawl_articles':
+                crawler_name = 'crawl_articles'
+            else:
+                # Fall back to the generic crawler for any other site
+                crawler_name = 'crawl_articles'
+
+            # Run the crawler command; website_name is not passed as an argument
+            call_command(crawler_name)
+
+            messages.success(request, f'成功执行爬虫: {crawler_name}')
+        except Exception as e:
+            messages.error(request, f'执行爬虫失败: {str(e)}')
+
+    return redirect('admin:core_article_changelist')
 
 
 @admin.register(Website)
@@ -39,22 +68,16 @@ class ArticleAdmin(admin.ModelAdmin):
     list_display = ('title', 'website', 'pub_date')
     search_fields = ('title', 'content')
     # Bulk actions available on the change list
-    actions = ['delete_selected_articles', 'delete_dongfangyancao_articles', 'export_as_csv', 'export_as_json',
-               'export_as_word']
+    actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json',
+               'export_as_word', 'export_with_media']
 
-    def delete_dongfangyancao_articles(self, request, queryset):
-        """Delete every 东方烟草报 article in one step"""
-        # Look up the 东方烟草报 Website object
-        try:
-            dongfangyancao_website = Website.objects.get(name='东方烟草报')
-            # Delete all of its articles
-            deleted_count = Article.objects.filter(website=dongfangyancao_website).delete()[0]
-            self.message_user(request, f"成功删除 {deleted_count} 篇东方烟草报文章", messages.SUCCESS)
-        except Website.DoesNotExist:
-            self.message_user(request, "未找到东方烟草报网站配置", messages.ERROR)
-
-    # Display name for the action
-    delete_dongfangyancao_articles.short_description = "删除所有东方烟草报文章"
+    # Override get_urls to register the custom crawler URL
+    def get_urls(self):
+        urls = super().get_urls()
+        custom_urls = [
+            path('run-crawler/', self.admin_site.admin_view(run_crawler_view), name='run_crawler'),
+        ]
+        return custom_urls + urls
 
     def export_as_csv(self, request, queryset):
         """Export the selected articles as CSV"""
@@ -205,6 +228,155 @@ class ArticleAdmin(admin.ModelAdmin):
 
     export_as_word.short_description = "导出选中文章为Word格式"
 
+    def export_with_media(self, request, queryset):
+        """Export the selected articles plus their media files as a ZIP bundle"""
+        try:
+            from docx import Document
+            from io import BytesIO
+            from docx.shared import Inches
+            import zipfile
+        except ImportError:
+            self.message_user(request, "缺少必要库,请安装: pip install python-docx", messages.ERROR)
+            return
+
+        # os/requests/settings are used in several branches below; import them
+        # once up front (the original inline imports left `requests` undefined
+        # for articles that had no inline images)
+        import os
+        import requests
+        from django.conf import settings
+
+        # Build the ZIP file in memory
+        zip_buffer = BytesIO()
+
+        with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
+            for article in queryset:
+                # One folder per article; sanitize the title outside the f-string
+                # (backslashes inside f-string expressions are a SyntaxError
+                # before Python 3.12)
+                safe_title = article.title
+                for ch in '/\\:*?"<>|':
+                    safe_title = safe_title.replace(ch, '_')
+                article_folder = f"article_{article.id}_{safe_title}"
+
+                # Create the Word document
+                doc = Document()
+                doc.add_heading(article.title, 0)
+
+                # Article metadata
+                doc.add_paragraph(f"网站: {article.website.name}")
+                doc.add_paragraph(f"URL: {article.url}")
+                doc.add_paragraph(
+                    f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
+                doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
+
+                # Article body
+                doc.add_heading('内容', level=2)
+                # Lightly process the HTML: strip tags and handle images
+                from bs4 import BeautifulSoup
+                soup = BeautifulSoup(article.content, 'html.parser')
+
+                # Handle images embedded in the content
+                for img in soup.find_all('img'):
+                    src = img.get('src', '')
+                    if src:
+                        # Try to embed the image in the document
+                        try:
+                            if src.startswith('http'):
+                                # Remote image
+                                response = requests.get(src, timeout=10)
+                                image_stream = BytesIO(response.content)
+                                doc.add_picture(image_stream, width=Inches(4.0))
+                                # Save the downloaded file into the ZIP
+                                zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(src)), response.content)
+                            else:
+                                # Local image
+                                full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
+                                if os.path.exists(full_path):
+                                    doc.add_picture(full_path, width=Inches(4.0))
+                                    # Add the file to the ZIP
+                                    zip_file.write(full_path, os.path.join(article_folder, 'media', src.lstrip('/')))
+                        except Exception as e:
+                            # If embedding fails, fall back to the image URL as text
+                            doc.add_paragraph(f"[图片: {src}]")
+
+                        # Drop the original img tag
+                        img.decompose()
+
+                content_text = soup.get_text()
+                doc.add_paragraph(content_text)
+
+                # List the media files and bundle them into the ZIP
+                if article.media_files:
+                    doc.add_heading('媒体文件', level=2)
+                    for media_file in article.media_files:
+                        try:
+                            full_path = os.path.join(settings.MEDIA_ROOT, media_file)
+                            # Pick a handler based on the file extension
+                            file_extension = os.path.splitext(media_file)[1].lower()
+
+                            # Image files
+                            if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
+                                if os.path.exists(full_path):
+                                    # Embed the image in the document
+                                    doc.add_picture(full_path, width=Inches(4.0))
+                                    # Add the file to the ZIP
+                                    zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
+                                else:
+                                    # Media file given as a URL
+                                    if media_file.startswith('http'):
+                                        response = requests.get(media_file, timeout=10)
+                                        image_stream = BytesIO(response.content)
+                                        doc.add_picture(image_stream, width=Inches(4.0))
+                                        # Save the downloaded file into the ZIP
+                                        zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
+                                    else:
+                                        doc.add_paragraph(media_file)
+                            # Video files
+                            elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
+                                # Videos go into the ZIP only, not into the Word document
+                                if os.path.exists(full_path):
+                                    # Add the file to the ZIP
+                                    zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
+                                    # Note the video in the Word document
+                                    doc.add_paragraph(f"[视频文件: {media_file}]")
+                                else:
+                                    # Media file given as a URL
+                                    if media_file.startswith('http'):
+                                        # Save the downloaded file into the ZIP
+                                        response = requests.get(media_file, timeout=10)
+                                        zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
+                                        doc.add_paragraph(f"[视频文件: {media_file}]")
+                                    else:
+                                        doc.add_paragraph(media_file)
+                            # Any other file type
+                            else:
+                                if os.path.exists(full_path):
+                                    # Add the file to the ZIP
+                                    zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
+                                    doc.add_paragraph(f"[文件: {media_file}]")
+                                else:
+                                    # Media file given as a URL
+                                    if media_file.startswith('http'):
+                                        response = requests.get(media_file, timeout=10)
+                                        zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
+                                        doc.add_paragraph(f"[文件: {media_file}]")
+                                    else:
+                                        doc.add_paragraph(media_file)
+                        except Exception as e:
+                            doc.add_paragraph(media_file)
+
+                # Save each article's Word document into its folder inside the ZIP
+                # (reusing safe_title keeps the entry name valid)
+                doc_buffer = BytesIO()
+                doc.save(doc_buffer)
+                doc_buffer.seek(0)
+                zip_file.writestr(os.path.join(article_folder, f'{safe_title}.docx'), doc_buffer.read())
+
+        # Build the HttpResponse
+        zip_buffer.seek(0)
+        from django.http import HttpResponse
+        response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
+        response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
+        return response
+
+    export_with_media.short_description = "导出选中文章及媒体文件(ZIP包)"
+
 
 # Dedicated article admin classes per site
 class NewsCnArticleAdmin(admin.ModelAdmin):
@@ -342,8 +514,3 @@ class DongfangyancaoArticleAdmin(admin.ModelAdmin):
 
 # Register the models on their respective admin sites
-news_cn_admin.register(Website, WebsiteAdmin)
-news_cn_admin.register(Article, NewsCnArticleAdmin)
-
-dongfangyancao_admin.register(Website, WebsiteAdmin)
-dongfangyancao_admin.register(Article, DongfangyancaoArticleAdmin)
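One thing worth noting about the admin.py change: it imports `threading` and `uuid` but `run_crawler_view` calls `call_command()` synchronously, so the admin request blocks until the crawl finishes. If background execution was the intent, a minimal sketch of how those imports could be put to use — this is an assumption about intent, not part of the patch:

```python
# Hypothetical async variant of run_crawler_view's core; NOT in the patch.
# Assumes the crawler commands are safe to run outside the request cycle.
import threading
import uuid

from django.core.management import call_command


def start_crawl_in_background(crawler_name: str) -> str:
    """Run a crawler management command on a daemon worker thread."""
    job_id = uuid.uuid4().hex  # opaque handle a UI could report or poll

    def worker():
        call_command(crawler_name)

    threading.Thread(target=worker, name=f"crawl-{job_id}", daemon=True).start()
    return job_id
```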
diff --git a/core/management/commands/crawl_xinhua_bak.py b/core/management/commands/crawl_xinhua_bak.py
deleted file mode 100644
index 99aaa98..0000000
--- a/core/management/commands/crawl_xinhua_bak.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from django.core.management.base import BaseCommand
-from core.models import Website
-from core.utils import crawl_xinhua_list
-
-class Command(BaseCommand):
-    help = '批量爬取新华网文章'
-
-    def handle(self, *args, **options):
-        # Usage marker, to confirm whether this command gets called
-        self.stdout.write(self.style.WARNING("crawl_xinhua command is being used"))
-
-        list_url = "https://www.news.cn/legal/index.html"
-        try:
-            website = Website.objects.get(base_url="https://www.news.cn/")
-        except Website.DoesNotExist:
-            self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
-            return
-
-        self.stdout.write(f"开始爬取文章列表页: {list_url}")
-        crawl_xinhua_list(list_url, website)
-        self.stdout.write(self.style.SUCCESS("批量爬取完成"))
\ No newline at end of file
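The deleted file was a stale backup of the `crawl_xinhua` command. Assuming the live command kept the same behavior, the equivalent run stays available both from the CLI and programmatically (as `run_crawler_view` above does):

```python
# Equivalent of the deleted backup command, via the live crawl_xinhua command.
from django.core.management import call_command

call_command('crawl_xinhua')  # same as: python manage.py crawl_xinhua
```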
diff --git a/core/management/commands/export_articles.py b/core/management/commands/export_articles.py
index 7f6912e..f144f0b 100644
--- a/core/management/commands/export_articles.py
+++ b/core/management/commands/export_articles.py
@@ -13,16 +13,20 @@ class Command(BaseCommand):
     help = '导出文章及相关的媒体文件(图片、视频等)'
 
     def add_arguments(self, parser):
-        parser.add_argument('--format', type=str, default='json', help='导出格式: json 或 csv')
+        parser.add_argument('--format', type=str, default='docx', help='导出格式: json、csv 或 docx')
         parser.add_argument('--website', type=str, help='指定网站名称导出特定网站的文章')
        parser.add_argument('--output', type=str, default='', help='输出文件路径')
-        parser.add_argument('--include-media', action='store_true', help='包含媒体文件')
+        # Default changed to True, making media inclusion the default behavior
+        # (note: with action='store_true' and default=True, the flag can no
+        # longer be switched off from the command line)
+        parser.add_argument('--include-media', action='store_true', default=True, help='包含媒体文件')
+        # Flag that controls whether the output is packed into a zip
+        parser.add_argument('--no-zip', action='store_true', help='不打包成zip文件')
 
     def handle(self, *args, **options):
         format_type = options['format'].lower()
         website_name = options['website']
         output_path = options['output']
         include_media = options['include_media']
+        no_zip = options['no_zip']
 
         # Build the article queryset
         articles = Article.objects.all()
@@ -65,20 +69,26 @@ class Command(BaseCommand):
         # Work out the output path
         if not output_path:
             timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
-            if include_media:
-                output_path = f'articles_export_{timestamp}.zip'
-            else:
-                output_path = f'articles_export_{timestamp}.{format_type}'
+            # Default to a zip bundle, unless --no-zip was passed (in which
+            # case the extension should match the chosen format)
+            if no_zip:
+                output_path = f'articles_export_{timestamp}.{format_type}'
+            else:
+                output_path = f'articles_export_{timestamp}.zip'
 
         # Run the export
-        if include_media:
-            self.export_with_media(articles_data, media_files, output_path, format_type)
+        # Pack into a zip by default when media is included or the format is docx
+        if include_media or format_type == 'docx':
+            if no_zip:
+                if format_type == 'docx':
+                    self.export_as_word(articles_data, output_path)
+                elif format_type == 'json':
+                    self.export_as_json(articles_data, output_path)
+                elif format_type == 'csv':
+                    self.export_as_csv(articles_data, output_path)
+            else:
+                self.export_with_media(articles_data, media_files, output_path, format_type)
         else:
             if format_type == 'json':
                 self.export_as_json(articles_data, output_path)
             elif format_type == 'csv':
                 self.export_as_csv(articles_data, output_path)
-            # Word-format export support
             elif format_type == 'docx':
                 self.export_as_word(articles_data, output_path)
             else:
@@ -220,7 +230,6 @@ class Command(BaseCommand):
                             'media_files'] else ''
                         writer.writerow(article_data)
                     zipf.writestr(data_filename, csv_buffer.getvalue())
-            # Word-format support
             elif format_type == 'docx':
                 # Build the Word document and save it into the ZIP
                 try:
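To see how the revised flags combine, a usage sketch against the `add_arguments` definitions above — option names come from the diff, while the site name and file path are illustrative:

```python
# Exercising the revised export_articles command programmatically.
from django.core.management import call_command

# New default behavior: docx format, media included, bundled into a zip.
call_command('export_articles')

# A plain JSON dump for one site, skipping the zip wrapper via --no-zip.
call_command('export_articles', format='json', website='新华网',
             no_zip=True, output='articles.json')
```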
diff --git a/core/templates/admin/core/article/change_list.html b/core/templates/admin/core/article/change_list.html
new file mode 100644
index 0000000..949e5f7
--- /dev/null
+++ b/core/templates/admin/core/article/change_list.html
@@ -0,0 +1,19 @@
+{% extends "admin/change_list.html" %}
+{% load admin_urls %}
+
+{% block object-tools %}
+    {{ block.super }}
[remaining lines of the new 19-line template truncated; their HTML markup was stripped in extraction]
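The truncated remainder of `change_list.html` presumably renders the form that posts to `run_crawler_view`. A minimal sketch of what that block might contain — the URL name matches the `get_urls()` entry and the option values match the branches in `run_crawler_view`, but the markup itself is an assumption, not the file's actual contents:

```django
{# Hypothetical remainder of change_list.html; the real markup was lost. #}
    <form method="post" action="{% url 'admin:run_crawler' %}" style="display:inline">
        {% csrf_token %}
        <select name="website_name">
            <option value="crawl_xinhua">新华网</option>
            <option value="crawl_dongfangyancao">东方烟草报</option>
            <option value="crawl_articles">通用爬虫</option>
        </select>
        <button type="submit" class="button">运行爬虫</button>
    </form>
{% endblock %}
```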