diff --git a/.gitignore b/.gitignore index 575c1ad..34e8a05 100644 --- a/.gitignore +++ b/.gitignore @@ -174,3 +174,5 @@ cython_debug/ # PyPI configuration file .pypirc + +media/ diff --git a/celerybeat-schedule.db b/celerybeat-schedule.db new file mode 100644 index 0000000..e69de29 diff --git a/crawler/admin.py b/crawler/admin.py index b738c61..c891f8d 100644 --- a/crawler/admin.py +++ b/crawler/admin.py @@ -83,7 +83,7 @@ class CrawledContentAdmin(admin.ModelAdmin): def preview_content(self, obj): """预览内容""" if obj.is_local_saved: - url = reverse('admin:crawled_content_preview', args=[obj.id]) + url = reverse('preview_crawled_content', args=[obj.id]) return format_html( '预览文章', url diff --git a/crawler/templates/crawler/dashboard.html b/crawler/templates/crawler/dashboard.html index 348059c..5114a79 100644 --- a/crawler/templates/crawler/dashboard.html +++ b/crawler/templates/crawler/dashboard.html @@ -186,6 +186,9 @@ {% endif %}
# Code points that XML 1.0 forbids. python-docx serialises through lxml, which
# raises ValueError when a string contains any of them, and crawled text can
# easily carry stray control characters.
_XML_ILLEGAL_CHARS = dict.fromkeys(
    [code for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)]
    + list(range(0xD800, 0xE000))
    + [0xFFFE, 0xFFFF]
)

# Characters that must not reach an archive member name or a quoted HTTP
# header value (path separators, quotes, shell/Windows-reserved characters).
_FILENAME_FORBIDDEN = frozenset('\\/:*?"<>|')


def _xml_safe(value):
    """Return ``value`` as text with XML-illegal characters removed."""
    return str(value).translate(_XML_ILLEGAL_CHARS)


def _safe_filename(title, max_length=50, fallback='article'):
    """Reduce an article title to a name safe for zip entries and HTTP headers."""
    cleaned = ''.join(
        ch for ch in str(title or '')
        if ch.isprintable() and ch not in _FILENAME_FORBIDDEN
    )
    cleaned = cleaned[:max_length].strip(' .')
    return cleaned or fallback


def _unique_name(name, used):
    """Return ``name``, adding a numeric suffix if ``used`` already holds it."""
    stem, extension = os.path.splitext(name)
    candidate = name
    counter = 1
    while candidate in used:
        candidate = f"{stem}_{counter}{extension}"
        counter += 1
    used.add(candidate)
    return candidate


def _attachment_header(base_name, extension):
    """Build a Content-Disposition value with an ASCII and a UTF-8 filename."""
    from urllib.parse import quote

    # Plain ``filename`` must be ASCII; ``filename*`` (RFC 6266 / RFC 5987)
    # carries the real, percent-encoded UTF-8 name for modern browsers.
    ascii_base = base_name.encode('ascii', 'ignore').decode('ascii').strip(' .')
    ascii_name = f"{ascii_base or 'article'}{extension}"
    encoded_name = quote(f"{base_name}{extension}", safe='')
    return (
        f'attachment; filename="{ascii_name}"; '
        + f"filename*=UTF-8''{encoded_name}"
    )


def _build_docx_bytes(content):
    """Render the article's metadata and body into a .docx and return its bytes."""
    doc = Document()
    doc.add_heading(_xml_safe(content.title or ''), 0)

    # Metadata block (labels are user-facing text and stay as they are).
    metadata_lines = (
        f'来源网站: {content.website.name} ({content.website.region})',
        f'原始链接: {content.url}',
        f'发布时间: {content.publish_date or "未知"}',
        f'作者: {content.author or "未知"}',
        f'匹配关键字: {content.keywords_matched}',
        f'爬取时间: {content.created_at}',
    )
    for line in metadata_lines:
        doc.add_paragraph(_xml_safe(line))

    # Article body: one Word paragraph per blank-line-separated block.
    doc.add_heading('正文', level=1)
    for block in (content.content or '').split('\n\n'):
        text = block.strip()
        if text:
            doc.add_paragraph(_xml_safe(text))

    doc_buffer = BytesIO()
    doc.save(doc_buffer)
    return doc_buffer.getvalue()


def download_crawled_content(request, content_id):
    """Download one crawled article as a zip (Word document plus media files).

    Args:
        request: The incoming HTTP request (not inspected).
        content_id: Primary key of the ``CrawledContent`` row to export.

    Returns:
        An ``HttpResponse`` with ``application/zip`` content, sent as an
        attachment named after the sanitised article title. The archive holds
        ``<title>.docx`` and every readable media file under ``media/``.

    Raises:
        Http404: If no ``CrawledContent`` with ``content_id`` exists.
    """
    import logging

    logger = logging.getLogger(__name__)
    content = get_object_or_404(CrawledContent, id=content_id)
    base_name = _safe_filename(content.title)

    zip_buffer = BytesIO()
    used_names = set()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        docx_name = _unique_name(f"{base_name}.docx", used_names)
        zip_file.writestr(docx_name, _build_docx_bytes(content))

        for media_file in content.media_files.all():
            local_file = media_file.local_file
            if not local_file:
                continue
            try:
                if not default_storage.exists(local_file.name):
                    continue
                # Context manager guarantees the storage handle is closed.
                with default_storage.open(local_file.name) as handle:
                    payload = handle.read()
            except Exception:
                # Best effort: storage backends raise heterogeneous errors, and
                # one unreadable attachment must not abort the whole download.
                logger.exception(
                    "Skipping unreadable media file %r for content %s",
                    local_file.name,
                    content_id,
                )
                continue
            # Different directories may hold files with the same basename.
            member_name = _unique_name(
                f"media/{os.path.basename(local_file.name)}", used_names
            )
            zip_file.writestr(member_name, payload)

    response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
    response['Content-Disposition'] = _attachment_header(base_name, '.zip')
    return response