Fix URL reversing error and add support for downloading files

2025-09-23 14:45:27 +08:00
parent e51154bb29
commit 7a4045048e
7 changed files with 76 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,5 @@ cython_debug/
 # PyPI configuration file
 .pypirc
 media/
--- a/celerybeat-schedule.db
+++ b/celerybeat-schedule.db
--- a/crawler/admin.py
+++ b/crawler/admin.py
@@ -83,7 +83,7 @@ class CrawledContentAdmin(admin.ModelAdmin):
    def preview_content(self, obj):
        """预览内容"""
        if obj.is_local_saved:
-            url = reverse('admin:crawled_content_preview', args=[obj.id])
+            url = reverse('preview_crawled_content', args=[obj.id])
            return format_html(
                '<a href="{}" target="_blank" class="button">预览文章</a>',
                url
--- a/crawler/templates/crawler/dashboard.html
+++ b/crawler/templates/crawler/dashboard.html
@@ -186,6 +186,9 @@
                                    {% endif %}
                                </small>
                                <div>
                                    <a href="{% url 'download_crawled_content' content.id %}" class="btn btn-sm btn-outline-primary" title="下载">
                                        <i class="bi bi-download"></i>
                                    </a>
                                    {% for keyword in content.keywords_matched|split:"," %}
                                        <span class="keyword-badge">{{ keyword|strip }}</span>
                                    {% endfor %}
--- a/crawler/urls.py
+++ b/crawler/urls.py
@@ -5,4 +5,5 @@ urlpatterns = [
    path('', views.dashboard, name='dashboard'),
    path('search/', views.search_page, name='search'),
    path('crawled-content/<int:content_id>/preview/', views.preview_crawled_content, name='preview_crawled_content'),
    path('crawled-content/<int:content_id>/download/', views.download_crawled_content, name='download_crawled_content'),
 ]
--- a/crawler/views.py
+++ b/crawler/views.py
@@ -1,9 +1,10 @@
 from django.shortcuts import render, get_object_or_404
-from django.http import HttpResponse
+from django.http import HttpResponse, Http404
 from django.db.models import Q, Count
 from django.conf import settings
 from django.utils import timezone
-from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword
+from django.core.files.storage import default_storage
 from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
 from rest_framework import viewsets, filters
 from rest_framework.decorators import action
 from rest_framework.response import Response
@@ -15,6 +16,12 @@ import json
 from django.core.paginator import Paginator
 from django.db.models.functions import TruncDate
 from django.db.models import Count
 import os
 import tempfile
 import zipfile
 from io import BytesIO
 from docx import Document
 from django.core.files.base import ContentFile
 def dashboard(request):
@@ -289,4 +296,62 @@ def preview_crawled_content(request, content_id):
 </body>
 </html>
    """
-    return HttpResponse(html_content, content_type='text/html; charset=utf-8')
+    return HttpResponse(html_content, content_type='text/html; charset=utf-8')
 def download_crawled_content(request, content_id):
    """Download one crawled article as a zip archive.

    The archive contains a Word document (title, metadata and body text) and,
    under ``media/``, every media file attached to the article that can be
    read from the default storage.

    Args:
        request: The incoming HTTP request (not inspected).
        content_id: Primary key of the ``CrawledContent`` row to export.

    Returns:
        An ``HttpResponse`` with ``application/zip`` content, sent as an
        attachment named after the (sanitized) article title.

    Raises:
        Http404: If no ``CrawledContent`` with ``content_id`` exists.
    """
    # Local imports: these names are not guaranteed to be imported at the top
    # of the module, and all of them are standard library.
    import logging
    import re
    from urllib.parse import quote

    logger = logging.getLogger(__name__)

    content = get_object_or_404(CrawledContent, id=content_id)
    title = content.title or ''

    # The title is user-controlled text scraped from the web. Strip characters
    # that are path separators, illegal in file names, or able to break the
    # Content-Disposition header (quotes, control characters).
    stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', title[:50]).strip(' ._')
    if not stem:
        stem = f'content_{content.id}'

    # Build the zip archive entirely in memory.
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        # Build the Word document.
        doc = Document()
        doc.add_heading(title, 0)

        # Metadata section.
        doc.add_paragraph(f'来源网站: {content.website.name} ({content.website.region})')
        doc.add_paragraph(f'原始链接: {content.url}')
        doc.add_paragraph(f'发布时间: {content.publish_date or "未知"}')
        doc.add_paragraph(f'作者: {content.author or "未知"}')
        doc.add_paragraph(f'匹配关键字: {content.keywords_matched}')
        doc.add_paragraph(f'爬取时间: {content.created_at}')

        # Body section: one Word paragraph per blank-line-separated block.
        doc.add_heading('正文', level=1)
        for paragraph in (content.content or '').split('\n\n'):
            text = paragraph.strip()
            if text:
                doc.add_paragraph(text)

        # Serialize the document to memory and add it to the archive.
        doc_buffer = BytesIO()
        doc.save(doc_buffer)
        zip_file.writestr(f"{stem}.docx", doc_buffer.getvalue())

        # Add the attached media files. This is best effort: an unreadable
        # file is logged and skipped so the rest of the archive is still
        # delivered.
        used_names = set()
        for media_file in content.media_files.all():
            stored = media_file.local_file
            if not stored:
                continue
            try:
                if not default_storage.exists(stored.name):
                    continue
                # Context manager guarantees the storage handle is closed.
                with default_storage.open(stored.name) as handle:
                    file_content = handle.read()
            except Exception:
                logger.exception(
                    "Could not read media file %r for content %s; skipping",
                    stored.name, content_id,
                )
                continue

            # Different storage paths may share a basename; disambiguate so
            # the archive never contains duplicate entries.
            arcname = os.path.basename(stored.name)
            base, ext = os.path.splitext(arcname)
            suffix = 1
            while arcname in used_names:
                arcname = f"{base}_{suffix}{ext}"
                suffix += 1
            used_names.add(arcname)
            zip_file.writestr(f"media/{arcname}", file_content)

    response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')

    # RFC 6266: plain ASCII ``filename`` as a fallback for old clients, plus a
    # percent-encoded UTF-8 ``filename*`` so non-ASCII titles survive intact.
    filename = f"{stem}.zip"
    ascii_stem = stem.encode('ascii', 'ignore').decode('ascii').strip(' ._')
    if not ascii_stem:
        ascii_stem = f'content_{content.id}'
    response['Content-Disposition'] = (
        f'attachment; filename="{ascii_stem}.zip"; '
        f"filename*=UTF-8''{quote(filename, safe='')}"
    )
    return response
--- a/requirements.txt
+++ b/requirements.txt
@@ -27,3 +27,4 @@ tzdata==2025.2
 urllib3==2.5.0
 vine==5.1.0
 wcwidth==0.2.14
 python-docx==1.2.0