Add export to the frontend

2025-08-13 00:26:39 +08:00
parent baea50bfa0
commit 5e396796ca
9 changed files with 1272 additions and 175 deletions


@@ -8,24 +8,53 @@ from django.http import HttpResponseRedirect
import csv
from django.http import HttpResponse
import json
# Imports needed by the admin views below
from django.shortcuts import render, redirect
from django.urls import path
from django.contrib import admin
from django.http import JsonResponse
from django.views.decorators.http import require_http_methods
from django.core.management import call_command
import threading
import uuid
from django.utils import timezone
# Custom admin sites, one per source website
class NewsCnAdminSite(AdminSite):
    site_header = "新华网管理后台"
    site_title = "新华网管理"
    index_title = "新华网内容管理"

class DongfangyancaoAdminSite(AdminSite):
    site_header = "东方烟草报管理后台"
    site_title = "东方烟草报管理"
    index_title = "东方烟草报内容管理"

# Instantiate the admin sites
news_cn_admin = NewsCnAdminSite(name='news_cn_admin')
dongfangyancao_admin = DongfangyancaoAdminSite(name='dongfangyancao_admin')
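# Note: each AdminSite instance has its own login and URL namespace
# ('news_cn_admin' / 'dongfangyancao_admin' when reversing URLs); both still
# need to be mounted in urls.py -- a sketch follows at the end of this diff.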
# View for triggering a crawler run from the admin
def run_crawler_view(request):
    """
    Admin view that runs a crawler management command.
    """
    if request.method == 'POST':
        website_name = request.POST.get('website_name')
        if not website_name:
            messages.error(request, '请选择要爬取的网站')
            return redirect('admin:core_article_changelist')
        try:
            # Map the submitted name to a management command; anything
            # unrecognized falls back to the generic crawler.
            if website_name in ('crawl_xinhua', 'crawl_dongfangyancao', 'crawl_articles'):
                crawler_name = website_name
            else:
                crawler_name = 'crawl_articles'
            # Run the crawler without passing website_name as an argument;
            # call_command executes synchronously, so the request blocks
            # until the crawl finishes.
            call_command(crawler_name)
            messages.success(request, f'成功执行爬虫: {crawler_name}')
        except Exception as e:
            messages.error(request, f'执行爬虫失败: {str(e)}')
    return redirect('admin:core_article_changelist')
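# For reference, the endpoint registered below in get_urls() can be exercised
# with Django's test client (a sketch; it assumes the default admin site is
# mounted at /admin/ and that `admin_user` is an existing staff user):
#
#   from django.test import Client
#   client = Client()
#   client.force_login(admin_user)
#   client.post('/admin/core/article/run-crawler/',
#               {'website_name': 'crawl_xinhua'})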
@admin.register(Website)
@@ -39,22 +68,16 @@ class ArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'website', 'pub_date')
    search_fields = ('title', 'content')
    # Bulk action options
    actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json',
               'export_as_word', 'export_with_media']
    def delete_dongfangyancao_articles(self, request, queryset):
        """Delete every 东方烟草报 article in one step"""
        try:
            dongfangyancao_website = Website.objects.get(name='东方烟草报')
            # delete() returns (total_deleted, per_model_counts); keep the total
            deleted_count = Article.objects.filter(website=dongfangyancao_website).delete()[0]
            self.message_user(request, f"成功删除 {deleted_count} 篇东方烟草报文章", messages.SUCCESS)
        except Website.DoesNotExist:
            self.message_user(request, "未找到东方烟草报网站配置", messages.ERROR)
    # Display name of the action
    delete_dongfangyancao_articles.short_description = "删除所有东方烟草报文章"
    # Override get_urls to add a custom URL
    def get_urls(self):
        urls = super().get_urls()
        custom_urls = [
            path('run-crawler/', self.admin_site.admin_view(run_crawler_view), name='run_crawler'),
        ]
        # Custom URLs go first so the default catch-all patterns cannot shadow them
        return custom_urls + urls
    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV"""
@@ -205,6 +228,155 @@ class ArticleAdmin(admin.ModelAdmin):
    export_as_word.short_description = "导出选中文章为Word格式"
    def export_with_media(self, request, queryset):
        """Export the selected articles and their media files as a ZIP archive"""
        # Third-party dependencies; fail with an install hint if any is missing
        try:
            from docx import Document
            from docx.shared import Inches
            from bs4 import BeautifulSoup
            import requests
        except ImportError:
            self.message_user(request, "缺少必要库,请安装: pip install python-docx beautifulsoup4 requests",
                              messages.ERROR)
            return
        import os
        import re
        import zipfile
        from io import BytesIO
        from django.conf import settings

        # Build the ZIP archive in memory (ZipFile defaults to ZIP_STORED,
        # i.e. entries are not compressed)
        zip_buffer = BytesIO()
        with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
            for article in queryset:
                # One folder per article; replace characters that are illegal in file names
                safe_title = re.sub(r'[\\/:*?"<>|]', '_', article.title)
                article_folder = f"article_{article.id}_{safe_title}"
                # Build the Word document
                doc = Document()
                doc.add_heading(article.title, 0)
                # Article metadata
                doc.add_paragraph(f"网站: {article.website.name}")
                doc.add_paragraph(f"URL: {article.url}")
                doc.add_paragraph(
                    f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
                doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
                # Article body
                doc.add_heading('内容', level=2)
                # Parse the HTML content so tags can be stripped and images handled
                soup = BeautifulSoup(article.content, 'html.parser')
                # Inline images: embed them in the document and copy them into the ZIP
                for img in soup.find_all('img'):
                    src = img.get('src', '')
                    if src:
                        try:
                            if src.startswith('http'):
                                # Remote image: download, embed, and store in the ZIP
                                response = requests.get(src, timeout=10)
                                doc.add_picture(BytesIO(response.content), width=Inches(4.0))
                                zip_file.writestr(f"{article_folder}/media/{os.path.basename(src)}",
                                                  response.content)
                            else:
                                # Local image under MEDIA_ROOT
                                full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                                if os.path.exists(full_path):
                                    doc.add_picture(full_path, width=Inches(4.0))
                                    zip_file.write(full_path, f"{article_folder}/media/{src.lstrip('/')}")
                        except Exception:
                            # Embedding failed: keep at least the image URL as text
                            doc.add_paragraph(f"[图片: {src}]")
                    # Remove the original <img> tag from the parsed content
                    img.decompose()
                doc.add_paragraph(soup.get_text())
                # Attached media files: embed images, archive everything
                if article.media_files:
                    doc.add_heading('媒体文件', level=2)
                    for media_file in article.media_files:
                        try:
                            full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                            # Dispatch on the file extension
                            file_extension = os.path.splitext(media_file)[1].lower()
                            if file_extension in ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff'):
                                # Images: embed in the document and add to the ZIP
                                if os.path.exists(full_path):
                                    doc.add_picture(full_path, width=Inches(4.0))
                                    zip_file.write(full_path, f"{article_folder}/media/{media_file}")
                                elif media_file.startswith('http'):
                                    # URL-style media file
                                    response = requests.get(media_file, timeout=10)
                                    doc.add_picture(BytesIO(response.content), width=Inches(4.0))
                                    zip_file.writestr(f"{article_folder}/media/{os.path.basename(media_file)}",
                                                      response.content)
                                else:
                                    doc.add_paragraph(media_file)
                            elif file_extension in ('.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm'):
                                # Videos: archive only, reference them by name in the document
                                if os.path.exists(full_path):
                                    zip_file.write(full_path, f"{article_folder}/media/{media_file}")
                                    doc.add_paragraph(f"[视频文件: {media_file}]")
                                elif media_file.startswith('http'):
                                    response = requests.get(media_file, timeout=10)
                                    zip_file.writestr(f"{article_folder}/media/{os.path.basename(media_file)}",
                                                      response.content)
                                    doc.add_paragraph(f"[视频文件: {media_file}]")
                                else:
                                    doc.add_paragraph(media_file)
                            else:
                                # Any other file type: archive and reference by name
                                if os.path.exists(full_path):
                                    zip_file.write(full_path, f"{article_folder}/media/{media_file}")
                                    doc.add_paragraph(f"[文件: {media_file}]")
                                elif media_file.startswith('http'):
                                    response = requests.get(media_file, timeout=10)
                                    zip_file.writestr(f"{article_folder}/media/{os.path.basename(media_file)}",
                                                      response.content)
                                    doc.add_paragraph(f"[文件: {media_file}]")
                                else:
                                    doc.add_paragraph(media_file)
                        except Exception:
                            # On any failure, fall back to recording the file name only
                            doc.add_paragraph(media_file)
                # Save this article's document into its folder inside the ZIP
                doc_buffer = BytesIO()
                doc.save(doc_buffer)
                doc_buffer.seek(0)
                zip_file.writestr(f"{article_folder}/{safe_title}.docx", doc_buffer.read())
        # Return the finished archive as a download
        zip_buffer.seek(0)
        response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
        response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
        return response
    export_with_media.short_description = "导出选中文章及媒体文件(ZIP包)"
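    # Note: the archive is assembled entirely in memory; for very large
    # exports, writing to a temporary file and returning a FileResponse
    # would be a safer alternative.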
# Dedicated article admin classes for the individual websites
class NewsCnArticleAdmin(admin.ModelAdmin):
@@ -342,8 +514,3 @@ class DongfangyancaoArticleAdmin(admin.ModelAdmin):
# Register the models with their respective admin sites
news_cn_admin.register(Website, WebsiteAdmin)
news_cn_admin.register(Article, NewsCnArticleAdmin)
dongfangyancao_admin.register(Website, WebsiteAdmin)
dongfangyancao_admin.register(Article, DongfangyancaoArticleAdmin)
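For reference, mounting the two custom admin sites might look like this in the
project's urls.py (a sketch; the path prefixes and the core.admin module path
are assumptions, not part of this commit):

from django.urls import path
from core.admin import news_cn_admin, dongfangyancao_admin

urlpatterns = [
    path('news-admin/', news_cn_admin.urls),
    path('yancao-admin/', dongfangyancao_admin.urls),
]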