Fix URL error and support file download

2025-09-23 14:45:27 +08:00
parent e51154bb29
commit 7a4045048e
7 changed files with 76 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,5 @@ cython_debug/
 # PyPI configuration file
 .pypirc

+
+media/
--- a/celerybeat-schedule.db
+++ b/celerybeat-schedule.db
--- a/crawler/admin.py
+++ b/crawler/admin.py
@@ -83,7 +83,7 @@ class CrawledContentAdmin(admin.ModelAdmin):
    def preview_content(self, obj):
        """预览内容"""
        if obj.is_local_saved:
-            url = reverse('admin:crawled_content_preview', args=[obj.id])
+            url = reverse('preview_crawled_content', args=[obj.id])
            return format_html(
                '<a href="{}" target="_blank" class="button">预览文章</a>',
                url
--- a/crawler/templates/crawler/dashboard.html
+++ b/crawler/templates/crawler/dashboard.html
@@ -186,6 +186,9 @@
                                    {% endif %}
                                </small>
                                <div>
+                                    <a href="{% url 'download_crawled_content' content.id %}" class="btn btn-sm btn-outline-primary" title="下载">
+                                        <i class="bi bi-download"></i>
+                                    </a>
                                    {% for keyword in content.keywords_matched|split:"," %}
                                        <span class="keyword-badge">{{ keyword|strip }}</span>
                                    {% endfor %}
--- a/crawler/urls.py
+++ b/crawler/urls.py
@@ -5,4 +5,5 @@ urlpatterns = [
    path('', views.dashboard, name='dashboard'),
    path('search/', views.search_page, name='search'),
    path('crawled-content/<int:content_id>/preview/', views.preview_crawled_content, name='preview_crawled_content'),
+    path('crawled-content/<int:content_id>/download/', views.download_crawled_content, name='download_crawled_content'),
 ]
--- a/crawler/views.py
+++ b/crawler/views.py
@@ -1,9 +1,10 @@
 from django.shortcuts import render, get_object_or_404
-from django.http import HttpResponse
+from django.http import HttpResponse, Http404
 from django.db.models import Q, Count
 from django.conf import settings
 from django.utils import timezone
-from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword
+from django.core.files.storage import default_storage
+from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
 from rest_framework import viewsets, filters
 from rest_framework.decorators import action
 from rest_framework.response import Response
@@ -15,6 +16,12 @@ import json
 from django.core.paginator import Paginator
 from django.db.models.functions import TruncDate
 from django.db.models import Count
+import os
+import tempfile
+import zipfile
+from io import BytesIO
+from docx import Document
+from django.core.files.base import ContentFile


 def dashboard(request):
@@ -289,4 +296,62 @@ def preview_crawled_content(request, content_id):
 </body>
 </html>
    """
-    return HttpResponse(html_content, content_type='text/html; charset=utf-8')
+    return HttpResponse(html_content, content_type='text/html; charset=utf-8')
+
+
+def _safe_download_name(title, fallback, max_length=50):
+    """Return ``title`` reduced to a name safe for zip members and HTTP headers.
+
+    Path separators, quotes, other characters that are invalid in common
+    file systems, and control characters are replaced with ``_``. The result
+    is truncated to ``max_length`` characters and stripped of surrounding
+    spaces and dots. ``fallback`` is returned when nothing usable remains
+    (for example an empty or missing title).
+    """
+    import re
+
+    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title or '')
+    cleaned = cleaned[:max_length].strip(' .')
+    return cleaned or fallback
+
+
+def download_crawled_content(request, content_id):
+    """Download one crawled article as a zip archive.
+
+    The archive contains a Word document (title, metadata and body text)
+    plus every media file of the article that exists in the default storage,
+    placed under ``media/``.
+
+    Args:
+        request: The incoming HTTP request (not inspected here).
+        content_id: Primary key of the ``CrawledContent`` to export.
+
+    Returns:
+        An ``HttpResponse`` with content type ``application/zip`` served as
+        an attachment.
+
+    Raises:
+        Http404: If no ``CrawledContent`` with ``content_id`` exists
+            (raised by ``get_object_or_404``).
+    """
+    import logging
+    from urllib.parse import quote
+
+    logger = logging.getLogger(__name__)
+    content = get_object_or_404(CrawledContent, id=content_id)
+
+    # The raw title may contain "/", quotes or be empty; never use it
+    # directly as a zip member name or inside the Content-Disposition header.
+    base_name = _safe_download_name(content.title, f'content_{content_id}')
+
+    # Build the Word document.
+    doc = Document()
+    doc.add_heading(content.title, 0)
+
+    # Metadata block.
+    doc.add_paragraph(f'来源网站: {content.website.name} ({content.website.region})')
+    doc.add_paragraph(f'原始链接: {content.url}')
+    doc.add_paragraph(f'发布时间: {content.publish_date or "未知"}')
+    doc.add_paragraph(f'作者: {content.author or "未知"}')
+    doc.add_paragraph(f'匹配关键字: {content.keywords_matched}')
+    doc.add_paragraph(f'爬取时间: {content.created_at}')
+
+    # Body: one Word paragraph per blank-line-separated chunk of text.
+    doc.add_heading('正文', level=1)
+    for paragraph in (content.content or '').split('\n\n'):
+        text = paragraph.strip()
+        if text:
+            doc.add_paragraph(text)
+
+    # Serialize the document in memory.
+    doc_buffer = BytesIO()
+    doc.save(doc_buffer)
+
+    # Assemble the zip archive in memory.
+    zip_buffer = BytesIO()
+    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+        zip_file.writestr(f'{base_name}.docx', doc_buffer.getvalue())
+
+        used_names = set()
+        for media_file in content.media_files.all():
+            if not media_file.local_file:
+                continue
+            stored_name = media_file.local_file.name
+            try:
+                if not default_storage.exists(stored_name):
+                    continue
+                # Context manager guarantees the storage handle is closed.
+                with default_storage.open(stored_name) as stored:
+                    file_content = stored.read()
+            except Exception:
+                # Best effort: one unreadable file must not abort the whole
+                # download, but the failure should be visible in the logs.
+                logger.exception(
+                    'Could not read media file %s for content %s',
+                    stored_name, content_id,
+                )
+                continue
+
+            # Different storage paths can share a basename; keep the zip
+            # member names unique so no entry shadows another.
+            arc_name = os.path.basename(stored_name)
+            if arc_name in used_names:
+                arc_name = f'{media_file.pk}_{arc_name}'
+            used_names.add(arc_name)
+            zip_file.writestr(f'media/{arc_name}', file_content)
+
+    # Build the response. Titles are frequently non-ASCII, so send an ASCII
+    # fallback in ``filename`` and the real name via RFC 5987 ``filename*``.
+    response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
+    filename = f'{base_name}.zip'
+    ascii_base = (
+        base_name.encode('ascii', 'ignore').decode('ascii').strip(' .')
+        or f'content_{content_id}'
+    )
+    response['Content-Disposition'] = (
+        f'attachment; filename="{ascii_base}.zip"; '
+        f"filename*=UTF-8''{quote(filename)}"
+    )
+
+    return response
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,3 +27,4 @@ tzdata==2025.2
 urllib3==2.5.0
 vine==5.1.0
 wcwidth==0.2.14
+python-docx==1.2.0