Support the first use case: 1. Add media-type filters to the website; 2. Add an export-all-as-ZIP action in the admin

This commit is contained in:
2025-09-12 03:37:26 +08:00
parent 922a88048b
commit a4891b1c30
9 changed files with 439 additions and 720 deletions

View File

@@ -297,6 +297,77 @@ class ArticleAdmin(admin.ModelAdmin):
}),
)
# Admin action: export the selected articles as a ZIP archive.
actions = ['export_selected_articles']

def export_selected_articles(self, request, queryset):
    """Export the selected articles as a single ZIP download.

    Each article gets its own folder in the archive containing a .docx
    rendering of the article (metadata + plain-text content) plus any media
    files that exist under MEDIA_ROOT.
    """
    import os
    import re
    import zipfile
    from io import BytesIO

    from bs4 import BeautifulSoup
    from django.conf import settings
    from django.http import HttpResponse
    from docx import Document

    def safe_name(name):
        # Replace filesystem-reserved characters in one pass. Using re.sub
        # instead of chained str.replace also keeps backslashes and double
        # quotes out of f-string expressions, which is a SyntaxError on
        # Python < 3.12.
        return re.sub(r'[\\/:*?"<>|]', '_', name)

    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
        for article in queryset:
            article_folder = f"article_{article.id}_{safe_name(article.title)}"

            # Build the Word document for this article.
            doc = Document()
            doc.add_heading(article.title, 0)
            doc.add_paragraph(f"网站: {article.website.name if article.website else ''}")
            doc.add_paragraph(f"URL: {article.url}")
            doc.add_paragraph(f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else ''}")
            doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S') if article.created_at else ''}")
            doc.add_heading('内容:', level=1)

            # Strip HTML tags so the document contains plain text only.
            soup = BeautifulSoup(article.content, 'html.parser')
            doc.add_paragraph(soup.get_text())

            doc_buffer = BytesIO()
            doc.save(doc_buffer)
            # ZIP member names must use forward slashes regardless of OS, so
            # build them with f-strings rather than os.path.join.
            zip_file.writestr(f"{article_folder}/{safe_name(article.title)}.docx",
                              doc_buffer.getvalue())

            # Best-effort copy of each media file into the archive.
            if article.media_files:
                for media_file in article.media_files:
                    try:
                        full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                        if os.path.exists(full_path):
                            zip_file.write(
                                full_path,
                                f"{article_folder}/media/{os.path.basename(media_file)}")
                    except OSError:
                        # Unreadable file: skip it, keep exporting the rest.
                        continue

    zip_buffer.seek(0)
    response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
    response['Content-Disposition'] = 'attachment; filename=selected_articles.zip'
    return response

export_selected_articles.short_description = "导出所选的文章为ZIP"
def content_preview(self, obj):
    """Return the first 100 characters of the article content.

    An ellipsis is appended when the content was truncated; shorter
    content is returned unchanged.
    """
    text = obj.content
    if len(text) > 100:
        return text[:100] + '...'
    return text
@@ -340,44 +411,69 @@ class ArticleAdmin(admin.ModelAdmin):
actions_column.short_description = '操作'
class CrawlerStatusAdmin(admin.ModelAdmin):
    """Admin changelist that surfaces distributed-crawler status."""

    change_list_template = 'admin/crawler_status.html'

    def changelist_view(self, request, extra_context=None):
        """Inject node statuses, recent batches and task stats into the context."""
        # Per-node status reported by the distributed crawler.
        node_ids = distributed_crawler.get_available_nodes()
        node_statuses = [distributed_crawler.get_node_status(nid) for nid in node_ids]

        # Ten most recent crawl batches.
        recent_batches = distributed_crawler.get_all_batches()[:10]

        # Aggregate counters shown in the template.
        task_stats = {
            'active_tasks': len([s for s in node_statuses if s['active_tasks'] > 0]),
            'total_nodes': len(node_ids),
            'total_batches': len(recent_batches),
        }

        extra_context = extra_context or {}
        extra_context.update({
            'nodes': node_statuses,
            'batches': recent_batches,
            'task_stats': task_stats,
        })
        return super().changelist_view(request, extra_context)
#class CrawlerStatusAdmin(admin.ModelAdmin):
# """爬虫状态管理"""
# change_list_template = 'admin/crawler_status.html'
#
# def changelist_view(self, request, extra_context=None):
# """爬虫状态视图"""
# # 获取分布式爬虫状态
# nodes = distributed_crawler.get_available_nodes()
# node_statuses = []
#
# for node_id in nodes:
# status = distributed_crawler.get_node_status(node_id)
# node_statuses.append(status)
#
# # 获取最近的批次
# batches = distributed_crawler.get_all_batches()[:10]
#
# # 获取任务统计
# task_stats = {
# 'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]),
# 'total_nodes': len(nodes),
# 'total_batches': len(batches),
# }
#
# extra_context = extra_context or {}
# extra_context.update({
# 'nodes': node_statuses,
# 'batches': batches,
# 'task_stats': task_stats,
# })
#
# return super().changelist_view(request, extra_context)
#
# Register the core model admins.
admin.site.register(Website, WebsiteAdmin)
admin.site.register(Article, ArticleAdmin)

# Hide the Celery Results admin pages by unregistering the models that
# django_celery_results auto-registers. Catch only what can actually occur
# here (app not installed / model never registered): a bare ``except:``
# would also swallow KeyboardInterrupt and SystemExit.
try:
    from django_celery_results.models import TaskResult, GroupResult
    from django_celery_results.admin import TaskResultAdmin, GroupResultAdmin

    admin.site.unregister(TaskResult)
    admin.site.unregister(GroupResult)
except (ImportError, admin.sites.NotRegistered):
    pass

# Hide the Celery Beat periodic-task admin pages the same way.
try:
    from django_celery_beat.models import (
        PeriodicTask,
        ClockedSchedule,
        CrontabSchedule,
        SolarSchedule,
        IntervalSchedule,
    )

    for _beat_model in (PeriodicTask, ClockedSchedule, CrontabSchedule,
                        SolarSchedule, IntervalSchedule):
        admin.site.unregister(_beat_model)
except (ImportError, admin.sites.NotRegistered):
    pass

# Branding for the admin site.
admin.site.site_header = 'Green Classroom 管理系统'
admin.site.site_title = 'Green Classroom'

View File

@@ -258,6 +258,18 @@
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
{% endfor %}
</div>
<!-- Filter articles by media type. The current website and search-query
     parameters are preserved in each link; search_query is urlencoded so
     queries containing &, #, + or spaces do not break the URL. -->
<div class="filters">
    <strong>按媒体类型筛选:</strong>
    <a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query|urlencode }}&{% endif %}media_type=all"
       {% if not request.GET.media_type or request.GET.media_type == 'all' %}class="active"{% endif %}>全部</a>
    <a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query|urlencode }}&{% endif %}media_type=text_only"
       {% if request.GET.media_type == 'text_only' %}class="active"{% endif %}>纯文本</a>
    <a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query|urlencode }}&{% endif %}media_type=with_images"
       {% if request.GET.media_type == 'with_images' %}class="active"{% endif %}>图片</a>
    <a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query|urlencode }}&{% endif %}media_type=with_videos"
       {% if request.GET.media_type == 'with_videos' %}class="active"{% endif %}>视频</a>
</div>
</div>
<!-- 主内容区域 -->
@@ -278,6 +290,10 @@
<button id="exportCsvBtn" class="export-btn" disabled>导出为CSV</button>
<!-- 新增:导出为ZIP包按钮 -->
<button id="exportZipBtn" class="export-btn" disabled>导出为ZIP包</button>
<!-- 删除:按类型导出按钮 -->
<!-- <button id="exportTextOnlyBtn" class="export-btn">导出纯文本</button>
<button id="exportWithImagesBtn" class="export-btn">导出含图片</button>
<button id="exportWithVideosBtn" class="export-btn">导出含视频</button> -->
</div>
<ul>
@@ -361,6 +377,10 @@
const deselectAllBtn = document.getElementById('deselectAllBtn');
// 新增:获取ZIP导出按钮元素
const exportZipBtn = document.getElementById('exportZipBtn');
// const exportTextOnlyBtn = document.getElementById('exportTextOnlyBtn');
// const exportWithImagesBtn = document.getElementById('exportWithImagesBtn');
// const exportWithVideosBtn = document.getElementById('exportWithVideosBtn');
// 更新导出按钮状态
function updateExportButtons() {
@@ -504,9 +524,56 @@
alert('导出失败: ' + error);
});
});
// exportTextOnlyBtn.addEventListener('click', () => {
// exportByMediaType('text_only');
// });
// exportWithImagesBtn.addEventListener('click', () => {
// exportByMediaType('with_images');
// });
// exportWithVideosBtn.addEventListener('click', () => {
// exportByMediaType('with_videos');
// });
// function exportByMediaType(mediaType) {
// // 发送POST请求按类型导出文章
// fetch('{% url "export_articles_by_type" %}', {
// method: 'POST',
// headers: {
// 'Content-Type': 'application/json',
// 'X-CSRFToken': '{{ csrf_token }}'
// },
// body: JSON.stringify({
// media_type: mediaType,
// format: 'zip'
// })
// })
// .then(response => {
// if (response.ok) {
// return response.blob();
// }
// throw new Error('导出失败');
// })
// .then(blob => {
// const url = window.URL.createObjectURL(blob);
// const a = document.createElement('a');
// a.href = url;
// a.download = `articles_${mediaType}.zip`;
// document.body.appendChild(a);
// a.click();
// window.URL.revokeObjectURL(url);
// document.body.removeChild(a);
// })
// .catch(error => {
// alert('导出失败: ' + error);
// });
// }
// 初始化导出按钮状态
updateExportButtons();
</script>
</body>
</html>

View File

@@ -1,24 +1,12 @@
from django.urls import path
from . import views, api
from . import views
urlpatterns = [
# 原有视图
path('', views.article_list, name='article_list'),
path('article/<int:article_id>/', views.article_detail, name='article_detail'),
# API接口
path('api/health/', api.HealthView.as_view(), name='api_health'),
path('api/websites/', api.WebsitesView.as_view(), name='api_websites'),
path('api/websites/<int:website_id>/', api.api_website_detail, name='api_website_detail'),
path('api/websites/<int:website_id>/crawl/', api.api_crawl_website, name='api_crawl_website'),
path('api/articles/', api.api_articles, name='api_articles'),
path('api/articles/<int:article_id>/', api.api_article_detail, name='api_article_detail'),
path('api/crawler/status/', api.api_crawler_status, name='api_crawler_status'),
path('api/crawler/distributed/', api.api_start_distributed_crawl, name='api_start_distributed_crawl'),
path('api/crawler/batch/<str:batch_id>/', api.api_batch_status, name='api_batch_status'),
path('api/cleanup/', api.api_cleanup_articles, name='api_cleanup_articles'),
path('api/stats/', api.api_stats, name='api_stats'),
# 添加导出文章的URL
path('api/export/', api.export_articles, name='export_articles'),
]
path('run-crawler/', views.run_crawler, name='run_crawler'),
path('crawler-status/', views.crawler_status, name='crawler_status'),
path('pause-crawler/', views.pause_crawler, name='pause_crawler'),
path('export-articles/', views.export_articles, name='export_articles'),
path('export-articles-by-type/', views.export_articles_by_type, name='export_articles_by_type'),
]

View File

@@ -38,6 +38,26 @@ def article_list(request):
if search_query:
articles = articles.filter(title__icontains=search_query)
# 新增:处理媒体类型筛选
media_type = request.GET.get('media_type', 'all')
if media_type == 'text_only':
# 纯文本文章(没有媒体文件)
articles = articles.filter(media_files__isnull=True) | articles.filter(media_files=[])
elif media_type == 'with_images':
# 包含图片的文章
articles = articles.filter(media_files__icontains='.jpg') | \
articles.filter(media_files__icontains='.jpeg') | \
articles.filter(media_files__icontains='.png') | \
articles.filter(media_files__icontains='.gif')
elif media_type == 'with_videos':
# 包含视频的文章
articles = articles.filter(media_files__icontains='.mp4') | \
articles.filter(media_files__icontains='.avi') | \
articles.filter(media_files__icontains='.mov') | \
articles.filter(media_files__icontains='.wmv') | \
articles.filter(media_files__icontains='.flv') | \
articles.filter(media_files__icontains='.webm')
# 按创建时间倒序排列
articles = articles.order_by('-created_at')
@@ -413,3 +433,204 @@ def export_articles(request):
except Exception as e:
return HttpResponse(f'导出失败: {str(e)}', status=500)
# Export articles filtered by media type (used by the website export buttons).
def _safe_filename(name):
    """Replace filesystem-reserved characters in *name* with underscores.

    One-pass re.sub instead of chained str.replace; it also keeps backslashes
    and double quotes out of f-string expressions, which is a SyntaxError on
    Python < 3.12.
    """
    import re
    return re.sub(r'[\\/:*?"<>|]', '_', name)


def _articles_for_media_type(media_type):
    """Return the Article queryset matching *media_type*.

    media_type: 'text_only' (no media files), 'with_images', 'with_videos',
    or anything else for all articles.
    """
    from django.db.models import Q

    def with_any_extension(extensions):
        # OR together one icontains test per extension.
        query = Q(media_files__icontains=extensions[0])
        for ext in extensions[1:]:
            query |= Q(media_files__icontains=ext)
        return Article.objects.filter(query)

    if media_type == 'text_only':
        # No media at all: NULL or an empty list.
        return (Article.objects.filter(media_files__isnull=True)
                | Article.objects.filter(media_files=[]))
    if media_type == 'with_images':
        return with_any_extension(('.jpg', '.jpeg', '.png', '.gif'))
    if media_type == 'with_videos':
        return with_any_extension(('.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm'))
    return Article.objects.all()


def _add_article_document(zip_file, folder, article):
    """Render *article* to a .docx inside *zip_file* under *folder*.

    Falls back to a JSON dump of the article when python-docx (or one of its
    helper libraries) is not installed.
    """
    import json
    import os
    from io import BytesIO

    from django.conf import settings

    try:
        import requests
        from bs4 import BeautifulSoup
        from docx import Document
        from docx.shared import Inches
    except ImportError:
        # python-docx not available: fall back to JSON.
        payload = {
            'id': article.id,
            'title': article.title,
            'website': article.website.name,
            'url': article.url,
            'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
            'content': article.content,
            'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
            'media_files': article.media_files,
        }
        zip_file.writestr(f"{folder}/{_safe_filename(article.title)}.json",
                          json.dumps(payload, ensure_ascii=False, indent=2))
        return

    doc = Document()
    doc.add_heading(article.title, 0)
    doc.add_paragraph(f"网站: {article.website.name}")
    doc.add_paragraph(f"URL: {article.url}")
    doc.add_paragraph(
        f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
    doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
    doc.add_heading('内容', level=1)

    # Embed inline <img> tags, then strip them so only text remains.
    soup = BeautifulSoup(article.content, 'html.parser')
    for img in soup.find_all('img'):
        src = img.get('src', '')
        if src:
            try:
                if src.startswith('http'):
                    # Remote image: download and embed.
                    response = requests.get(src, timeout=10)
                    doc.add_picture(BytesIO(response.content), width=Inches(4.0))
                else:
                    # Local image under MEDIA_ROOT.
                    full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                    if os.path.exists(full_path):
                        doc.add_picture(full_path, width=Inches(4.0))
            except Exception:
                # Could not fetch/decode the image: keep a textual placeholder.
                doc.add_paragraph(f"[图片: {src}]")
        img.decompose()
    doc.add_paragraph(soup.get_text())

    # List/embed the article's attached media files.
    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff'}
    video_exts = {'.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm'}
    if article.media_files:
        doc.add_heading('媒体文件', level=1)
        for media_file in article.media_files:
            try:
                full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                ext = os.path.splitext(media_file)[1].lower()
                if os.path.exists(full_path):
                    if ext in image_exts:
                        doc.add_picture(full_path, width=Inches(4.0))
                    elif ext in video_exts:
                        doc.add_paragraph(f"[视频文件: {media_file}]")
                    else:
                        doc.add_paragraph(f"[文件: {media_file}]")
                elif media_file.startswith('http'):
                    response = requests.get(media_file, timeout=10)
                    if ext in image_exts:
                        doc.add_picture(BytesIO(response.content), width=Inches(4.0))
                    else:
                        doc.add_paragraph(f"[文件: {media_file}]")
                else:
                    doc.add_paragraph(media_file)
            except Exception:
                # Any failure on one attachment degrades to plain text.
                doc.add_paragraph(media_file)

    doc_buffer = BytesIO()
    doc.save(doc_buffer)
    # ZIP member names must use forward slashes on every OS, so build them
    # with f-strings rather than os.path.join.
    zip_file.writestr(f"{folder}/{_safe_filename(article.title)}.docx",
                      doc_buffer.getvalue())


def _add_article_media(zip_file, folder, article):
    """Copy each of the article's media files into the archive (best effort)."""
    import os

    from django.conf import settings

    for media_file in article.media_files or []:
        try:
            full_path = os.path.join(settings.MEDIA_ROOT, media_file)
            if os.path.exists(full_path):
                zip_file.write(full_path, f"{folder}/media/{media_file}")
            elif media_file.startswith('http'):
                # URL-style media entry: download into the archive.
                import requests
                response = requests.get(media_file, timeout=10)
                zip_file.writestr(f"{folder}/media/{os.path.basename(media_file)}",
                                  response.content)
        except Exception:
            # Skip broken media entries; the export continues.
            continue


@csrf_exempt
@require_http_methods(["POST"])
def export_articles_by_type(request):
    """Export articles matching a media type as a ZIP of Word documents.

    POST body (JSON): ``media_type`` in {'all', 'text_only', 'with_images',
    'with_videos'} and ``format`` (only 'zip' is supported). Each matching
    article gets its own folder containing a .docx rendering plus its media
    files. Returns 400 when nothing matches or the format is unsupported,
    500 on unexpected failure.
    """
    import zipfile
    from io import BytesIO

    try:
        data = json.loads(request.body)
        media_type = data.get('media_type', 'all')
        format_type = data.get('format', 'zip')

        # distinct() removes duplicates produced by the OR'd filters.
        articles = _articles_for_media_type(media_type).distinct()
        if not articles.exists():
            return HttpResponse('没有符合条件的文章', status=400)

        if format_type != 'zip':
            return HttpResponse('不支持的格式', status=400)

        zip_buffer = BytesIO()
        with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
            for article in articles:
                folder = f"article_{article.id}_{_safe_filename(article.title)}"
                _add_article_document(zip_file, folder, article)
                _add_article_media(zip_file, folder, article)

        zip_buffer.seek(0)
        response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
        response['Content-Disposition'] = f'attachment; filename=articles_{media_type}.zip'
        return response
    except Exception as e:
        return HttpResponse(f'导出失败: {str(e)}', status=500)