Fix URL reversing error and add support for downloading crawled content
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -174,3 +174,5 @@ cython_debug/
|
|||||||
# PyPI configuration file
|
# PyPI configuration file
|
||||||
.pypirc
|
.pypirc
|
||||||
|
|
||||||
|
|
||||||
|
media/
|
||||||
|
|||||||
0
celerybeat-schedule.db
Normal file
0
celerybeat-schedule.db
Normal file
@@ -83,7 +83,7 @@ class CrawledContentAdmin(admin.ModelAdmin):
|
|||||||
def preview_content(self, obj):
|
def preview_content(self, obj):
|
||||||
"""预览内容"""
|
"""预览内容"""
|
||||||
if obj.is_local_saved:
|
if obj.is_local_saved:
|
||||||
url = reverse('admin:crawled_content_preview', args=[obj.id])
|
url = reverse('preview_crawled_content', args=[obj.id])
|
||||||
return format_html(
|
return format_html(
|
||||||
'<a href="{}" target="_blank" class="button">预览文章</a>',
|
'<a href="{}" target="_blank" class="button">预览文章</a>',
|
||||||
url
|
url
|
||||||
|
|||||||
@@ -186,6 +186,9 @@
|
|||||||
{% endif %}
|
{% endif %}
|
||||||
</small>
|
</small>
|
||||||
<div>
|
<div>
|
||||||
|
<a href="{% url 'download_crawled_content' content.id %}" class="btn btn-sm btn-outline-primary" title="下载">
|
||||||
|
<i class="bi bi-download"></i>
|
||||||
|
</a>
|
||||||
{% for keyword in content.keywords_matched|split:"," %}
|
{% for keyword in content.keywords_matched|split:"," %}
|
||||||
<span class="keyword-badge">{{ keyword|strip }}</span>
|
<span class="keyword-badge">{{ keyword|strip }}</span>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|||||||
@@ -5,4 +5,5 @@ urlpatterns = [
|
|||||||
path('', views.dashboard, name='dashboard'),
|
path('', views.dashboard, name='dashboard'),
|
||||||
path('search/', views.search_page, name='search'),
|
path('search/', views.search_page, name='search'),
|
||||||
path('crawled-content/<int:content_id>/preview/', views.preview_crawled_content, name='preview_crawled_content'),
|
path('crawled-content/<int:content_id>/preview/', views.preview_crawled_content, name='preview_crawled_content'),
|
||||||
|
path('crawled-content/<int:content_id>/download/', views.download_crawled_content, name='download_crawled_content'),
|
||||||
]
|
]
|
||||||
@@ -1,9 +1,10 @@
|
|||||||
from django.shortcuts import render, get_object_or_404
|
from django.shortcuts import render, get_object_or_404
|
||||||
from django.http import HttpResponse
|
from django.http import HttpResponse, Http404
|
||||||
from django.db.models import Q, Count
|
from django.db.models import Q, Count
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword
|
from django.core.files.storage import default_storage
|
||||||
|
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
|
||||||
from rest_framework import viewsets, filters
|
from rest_framework import viewsets, filters
|
||||||
from rest_framework.decorators import action
|
from rest_framework.decorators import action
|
||||||
from rest_framework.response import Response
|
from rest_framework.response import Response
|
||||||
@@ -15,6 +16,12 @@ import json
|
|||||||
from django.core.paginator import Paginator
|
from django.core.paginator import Paginator
|
||||||
from django.db.models.functions import TruncDate
|
from django.db.models.functions import TruncDate
|
||||||
from django.db.models import Count
|
from django.db.models import Count
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
import zipfile
|
||||||
|
from io import BytesIO
|
||||||
|
from docx import Document
|
||||||
|
from django.core.files.base import ContentFile
|
||||||
|
|
||||||
|
|
||||||
def dashboard(request):
|
def dashboard(request):
|
||||||
@@ -289,4 +296,62 @@ def preview_crawled_content(request, content_id):
|
|||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
return HttpResponse(html_content, content_type='text/html; charset=utf-8')
|
return HttpResponse(html_content, content_type='text/html; charset=utf-8')
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_archive_name(title, fallback='article', max_length=50):
    """Return *title* reduced to a name usable as a zip member and in an HTTP header."""
    import re

    # Path separators would create nested zip entries, quotes would break the
    # quoted header value, and control characters (CR/LF) are rejected by
    # Django as header content, so all of them are collapsed to "_".
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f\x7f]+', '_', title or '')
    cleaned = cleaned[:max_length].strip(' ._')
    return cleaned or fallback


def _unique_member_name(name, used):
    """Return *name*, suffixed with a counter if it is already in *used*; record the result."""
    stem, ext = os.path.splitext(name)
    candidate = name
    counter = 1
    while candidate in used:
        candidate = f"{stem}_{counter}{ext}"
        counter += 1
    used.add(candidate)
    return candidate


def download_crawled_content(request, content_id):
    """Download one crawled article as a zip archive.

    The archive contains a Word document (title, metadata and body text)
    plus every media file attached to the article that exists in the
    default storage, placed under ``media/``.

    Args:
        request: The incoming HTTP request (not inspected).
        content_id: Primary key of the ``CrawledContent`` row to export.

    Returns:
        An ``HttpResponse`` with content type ``application/zip`` served as
        an attachment.

    Raises:
        Http404: If no ``CrawledContent`` with ``content_id`` exists.
    """
    import logging
    from urllib.parse import quote

    logger = logging.getLogger(__name__)

    content = get_object_or_404(CrawledContent, id=content_id)
    base_name = _safe_archive_name(content.title)

    # Build the zip entirely in memory.
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        # Build the Word document.
        doc = Document()
        doc.add_heading(content.title, 0)

        # Metadata section.
        doc.add_paragraph(f'来源网站: {content.website.name} ({content.website.region})')
        doc.add_paragraph(f'原始链接: {content.url}')
        doc.add_paragraph(f'发布时间: {content.publish_date or "未知"}')
        doc.add_paragraph(f'作者: {content.author or "未知"}')
        doc.add_paragraph(f'匹配关键字: {content.keywords_matched}')
        doc.add_paragraph(f'爬取时间: {content.created_at}')

        # Body: one Word paragraph per blank-line-separated chunk.
        doc.add_heading('正文', level=1)
        for paragraph in (content.content or '').split('\n\n'):
            text = paragraph.strip()
            if text:
                doc.add_paragraph(text)

        # Serialize the document to memory and store it in the archive.
        doc_buffer = BytesIO()
        doc.save(doc_buffer)
        zip_file.writestr(f"{base_name}.docx", doc_buffer.getvalue())

        # Add media files. This is best-effort: one unreadable file must not
        # abort the whole download, but the failure is logged, not hidden.
        used_names = set()
        for media_file in content.media_files.all():
            try:
                local_file = media_file.local_file
                if not local_file or not default_storage.exists(local_file.name):
                    continue
                # Close the storage handle deterministically.
                with default_storage.open(local_file.name) as stored:
                    file_content = stored.read()
                # Different storage directories may hold equal basenames;
                # keep zip member names unique so nothing is shadowed.
                member = _unique_member_name(
                    os.path.basename(local_file.name), used_names
                )
                zip_file.writestr(f"media/{member}", file_content)
            except Exception:
                logger.warning(
                    "Skipping unreadable media file for content %s",
                    content_id,
                    exc_info=True,
                )

    # Prepare the response. getvalue() returns the full buffer regardless of
    # the current stream position.
    response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')

    # Provide an ASCII fallback plus an RFC 5987 UTF-8 name so that non-ASCII
    # titles survive the header intact.
    ascii_name = (
        base_name.encode('ascii', 'ignore').decode('ascii').strip(' ._') or 'article'
    )
    utf8_name = quote(f"{base_name}.zip")
    response['Content-Disposition'] = (
        f'attachment; filename="{ascii_name}.zip"; '
        f"filename*=UTF-8''{utf8_name}"
    )

    return response
|
||||||
|
|||||||
@@ -27,3 +27,4 @@ tzdata==2025.2
|
|||||||
urllib3==2.5.0
|
urllib3==2.5.0
|
||||||
vine==5.1.0
|
vine==5.1.0
|
||||||
wcwidth==0.2.14
|
wcwidth==0.2.14
|
||||||
|
python-docx==1.2.0
|
||||||
Reference in New Issue
Block a user