Support download

This commit is contained in:
2025-09-23 15:01:36 +08:00
parent 45c005687d
commit f15b730dca
4 changed files with 227 additions and 64 deletions

View File

@@ -461,27 +461,51 @@ class WebsiteCrawler:
publish_date = self.extract_publish_date(article_soup)
author = self.extract_author(article_soup)
# 保存内容
crawled_content = CrawledContent.objects.create(
task=self.task,
website=website,
title=title,
content=content,
# 检查是否已存在相同URL的文章
existing_content = CrawledContent.objects.filter(
url=link_info['url'],
publish_date=publish_date,
author=author,
keywords_matched=','.join(matched_keywords),
is_local_saved=False # 初始设置为False保存到本地后会更新为True
)
task=self.task
).first()
# 提取并下载媒体文件
media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
if existing_content:
# 如果已存在,更新现有记录而不是创建新记录
existing_content.title = title
existing_content.content = content
existing_content.publish_date = publish_date
existing_content.author = author
existing_content.keywords_matched = ','.join(matched_keywords)
existing_content.save()
# 标记内容已保存
self.mark_content_saved(crawled_content)
# 更新媒体文件
# 先删除旧的媒体文件
existing_content.media_files.all().delete()
# 然后重新下载媒体文件
media_files = self.extract_and_download_media(article_soup, existing_content, link_info['url'])
self.log('info', f'更新已存在的文章: {title[:50]}...', website)
else:
# 保存新内容
crawled_content = CrawledContent.objects.create(
task=self.task,
website=website,
title=title,
content=content,
url=link_info['url'],
publish_date=publish_date,
author=author,
keywords_matched=','.join(matched_keywords),
is_local_saved=False # 初始设置为False保存到本地后会更新为True
)
# 提取并下载媒体文件
media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
# 标记内容已保存
self.mark_content_saved(crawled_content)
self.log('info', f'保存新文章: {title[:50]}...', website)
crawled_count += 1
self.log('info', f'保存文章: {title[:50]}...', website)
# 请求间隔
time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])

View File

@@ -151,61 +151,80 @@
<!-- 按网站分类显示内容 -->
<div class="col-md-8">
{% for website_name, contents in stats.contents_by_website.items %}
<div class="card mb-4">
<div class="card-header">
<h5 class="card-title mb-0">
<i class="bi bi-globe"></i> {{ website_name }}
<span class="badge bg-secondary">{{ contents|length }}</span>
</h5>
</div>
<form id="download-form" method="post" action="{% url 'download_selected_contents' %}">
{% csrf_token %}
<!-- 批量操作按钮 -->
<div class="card mb-3">
<div class="card-body">
<div class="list-group list-group-flush">
{% for content in contents %}
<div class="list-group-item">
<div class="d-flex w-100 justify-content-between">
<h6 class="mb-1">
{% if content.is_local_saved %}
<a href="{% url 'preview_crawled_content' content.id %}" target="_blank" class="text-decoration-none">
{{ content.title|truncatechars:60 }}
<button type="submit" class="btn btn-primary" id="download-selected" disabled>
<i class="bi bi-download"></i> 批量下载选中文章
</button>
<button type="button" class="btn btn-outline-secondary" id="select-all">
<i class="bi bi-check-all"></i> 全选
</button>
<button type="button" class="btn btn-outline-secondary" id="deselect-all">
<i class="bi bi-x-circle"></i> 取消全选
</button>
</div>
</div>
{% for website_name, contents in stats.contents_by_website.items %}
<div class="card mb-4">
<div class="card-header">
<h5 class="card-title mb-0">
<i class="bi bi-globe"></i> {{ website_name }}
<span class="badge bg-secondary">{{ contents|length }}</span>
</h5>
</div>
<div class="card-body">
<div class="list-group list-group-flush">
{% for content in contents %}
<div class="list-group-item">
<div class="d-flex w-100 justify-content-between">
<h6 class="mb-1">
<input type="checkbox" name="selected_contents" value="{{ content.id }}" class="me-2 article-checkbox">
{% if content.is_local_saved %}
<a href="{% url 'preview_crawled_content' content.id %}" target="_blank" class="text-decoration-none">
{{ content.title|truncatechars:60 }}
</a>
{% else %}
<a href="{{ content.url }}" target="_blank" class="text-decoration-none">
{{ content.title|truncatechars:60 }}
</a>
{% endif %}
</h6>
<small class="text-muted">{{ content.created_at|date:"m-d H:i" }}</small>
</div>
<p class="mb-1 content-preview">{{ content.content|truncatechars:100 }}</p>
<div class="d-flex justify-content-between align-items-center">
<small class="text-muted">
<i class="bi bi-geo-alt"></i> {{ content.website.region }}
{% if content.media_files.count > 0 %}
| <i class="bi bi-image"></i> {{ content.media_files.count }} 个媒体文件
{% endif %}
</small>
<div>
<a href="{% url 'download_crawled_content' content.id %}" class="btn btn-sm btn-outline-primary" title="下载">
<i class="bi bi-download"></i>
</a>
{% else %}
<a href="{{ content.url }}" target="_blank" class="text-decoration-none">
{{ content.title|truncatechars:60 }}
</a>
{% endif %}
</h6>
<small class="text-muted">{{ content.created_at|date:"m-d H:i" }}</small>
</div>
<p class="mb-1 content-preview">{{ content.content|truncatechars:100 }}</p>
<div class="d-flex justify-content-between align-items-center">
<small class="text-muted">
<i class="bi bi-geo-alt"></i> {{ content.website.region }}
{% if content.media_files.count > 0 %}
| <i class="bi bi-image"></i> {{ content.media_files.count }} 个媒体文件
{% endif %}
</small>
<div>
<a href="{% url 'download_crawled_content' content.id %}" class="btn btn-sm btn-outline-primary" title="下载">
<i class="bi bi-download"></i>
</a>
{% for keyword in content.keywords_matched|split:"," %}
<span class="keyword-badge">{{ keyword|strip }}</span>
{% endfor %}
{% for keyword in content.keywords_matched|split:"," %}
<span class="keyword-badge">{{ keyword|strip }}</span>
{% endfor %}
</div>
</div>
</div>
{% endfor %}
</div>
{% endfor %}
</div>
</div>
</div>
{% empty %}
<div class="card">
<div class="card-body text-center">
<p class="text-muted py-3">暂无爬取内容</p>
{% empty %}
<div class="card">
<div class="card-body text-center">
<p class="text-muted py-3">暂无爬取内容</p>
</div>
</div>
</div>
{% endfor %}
{% endfor %}
</form>
<!-- 分页信息 -->
{% if stats.page_obj.has_other_pages %}
@@ -321,3 +340,45 @@
</div>
</div>
{% endblock %}
{% block extra_js %}
<script>
// Refresh the bulk-download button: enabled and labelled with the live count
// while at least one article checkbox is ticked, otherwise disabled and
// showing the default label.
function updateSelectionStatus() {
    const checkedTotal = document.querySelectorAll('.article-checkbox:checked').length;
    const button = document.getElementById('download-selected');
    const hasSelection = checkedTotal > 0;

    button.disabled = !hasSelection;
    button.innerHTML = hasSelection
        ? `<i class="bi bi-download"></i> 批量下载 (${checkedTotal})`
        : '<i class="bi bi-download"></i> 批量下载选中文章';
}
// Tick (checked = true) or untick (checked = false) every article checkbox,
// then bring the bulk-download button back in sync.
function setAllArticleCheckboxes(checked) {
    for (const box of document.querySelectorAll('.article-checkbox')) {
        box.checked = checked;
    }
    updateSelectionStatus();
}

// "Select all" button
document.getElementById('select-all').addEventListener('click', () => {
    setAllArticleCheckboxes(true);
});

// "Deselect all" button
document.getElementById('deselect-all').addEventListener('click', () => {
    setAllArticleCheckboxes(false);
});

// Re-evaluate the button whenever a single checkbox is toggled
for (const box of document.querySelectorAll('.article-checkbox')) {
    box.addEventListener('change', updateSelectionStatus);
}

// Establish the initial button state
updateSelectionStatus();
</script>
{% endblock %}

View File

@@ -6,4 +6,5 @@ urlpatterns = [
path('search/', views.search_page, name='search'),
path('crawled-content/<int:content_id>/preview/', views.preview_crawled_content, name='preview_crawled_content'),
path('crawled-content/<int:content_id>/download/', views.download_crawled_content, name='download_crawled_content'),
path('crawled-content/download-selected/', views.download_selected_contents, name='download_selected_contents'),
]

View File

@@ -355,3 +355,80 @@ def download_crawled_content(request, content_id):
response['Content-Disposition'] = f'attachment; filename="{filename}"'
return response
def _safe_archive_name(text, limit, fallback):
    """Reduce *text* to a name that is safe to use inside a zip entry path.

    Only the first *limit* characters are considered. Alphanumerics
    (including CJK), spaces, hyphens and underscores are kept, so path
    separators and other filesystem-hostile characters never reach the
    archive. *fallback* is returned when nothing usable remains.
    """
    cleaned = "".join(
        ch for ch in text[:limit] if ch.isalnum() or ch in (' ', '-', '_')
    ).strip()
    return cleaned or fallback


def download_selected_contents(request):
    """Download several selected articles as a single zip archive.

    Expects a POST whose ``selected_contents`` values are CrawledContent ids.
    Each article gets its own folder in the archive holding a Word document
    (metadata + body) and a ``media/`` sub-folder with its locally stored
    media files.

    Returns:
        200 with ``application/zip`` content on success;
        400 when no valid id was submitted;
        404 when none of the ids match an article;
        405 for any request method other than POST.
    """
    import logging  # local import so this view does not rely on a module-level one

    if request.method != 'POST':
        return HttpResponse("无效的请求方法", status=405)

    # Malformed (non-integer) ids are ignored instead of reaching the ORM,
    # where they would raise ValueError and surface as a 500.
    selected_ids = []
    for raw_id in request.POST.getlist('selected_contents'):
        try:
            selected_ids.append(int(raw_id))
        except (TypeError, ValueError):
            continue
    if not selected_ids:
        return HttpResponse("请至少选择一篇文章", status=400)

    # Evaluate once and pull the related rows up front to avoid one query
    # per article for the website and the media files.
    contents = list(
        CrawledContent.objects.filter(id__in=selected_ids)
        .select_related('website')
        .prefetch_related('media_files')
    )
    if not contents:
        return HttpResponse("未找到选中的文章", status=404)

    logger = logging.getLogger(__name__)
    zip_buffer = BytesIO()
    used_folders = set()

    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for content in contents:
            fallback_name = f"article_{content.id}"

            # One folder per article; disambiguate with the id when two
            # titles collapse to the same folder name.
            folder_name = _safe_archive_name(content.title, 30, fallback_name)
            if folder_name in used_folders:
                folder_name = f"{folder_name}_{content.id}"
            used_folders.add(folder_name)

            # Build the Word document
            doc = Document()
            doc.add_heading(content.title, 0)

            # Metadata
            doc.add_paragraph(f'来源网站: {content.website.name} ({content.website.region})')
            doc.add_paragraph(f'原始链接: {content.url}')
            doc.add_paragraph(f'发布时间: {content.publish_date or "未知"}')
            doc.add_paragraph(f'作者: {content.author or "未知"}')
            doc.add_paragraph(f'匹配关键字: {content.keywords_matched}')
            doc.add_paragraph(f'爬取时间: {content.created_at}')

            # Body: one paragraph per blank-line-separated chunk
            doc.add_heading('正文', level=1)
            for paragraph in content.content.split('\n\n'):
                if paragraph.strip():
                    doc.add_paragraph(paragraph.strip())

            # Serialize the document in memory and add it to the archive.
            # The file name is sanitized too: a raw title may contain "/"
            # or other characters that would corrupt the archive layout.
            doc_buffer = BytesIO()
            doc.save(doc_buffer)
            doc_name = _safe_archive_name(content.title, 50, fallback_name)
            zip_file.writestr(f"{folder_name}/{doc_name}.docx", doc_buffer.getvalue())

            # Add locally stored media files; this stays best-effort, but a
            # failure is logged instead of being silently discarded.
            for media_file in content.media_files.all():
                stored = media_file.local_file
                try:
                    if not stored or not default_storage.exists(stored.name):
                        continue
                    # Context manager guarantees the storage handle is closed.
                    with default_storage.open(stored.name) as handle:
                        payload = handle.read()
                    zip_file.writestr(
                        f"{folder_name}/media/{os.path.basename(stored.name)}",
                        payload,
                    )
                except Exception:
                    logger.exception(
                        "Skipping unreadable media file %s of content %s",
                        media_file.pk,
                        content.pk,
                    )

    # Build the response; getvalue() returns the whole buffer regardless of position
    response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
    response['Content-Disposition'] = 'attachment; filename="selected_articles.zip"'
    return response