Support download
This commit is contained in:
@@ -461,27 +461,51 @@ class WebsiteCrawler:
|
||||
publish_date = self.extract_publish_date(article_soup)
|
||||
author = self.extract_author(article_soup)
|
||||
|
||||
# 保存内容
|
||||
crawled_content = CrawledContent.objects.create(
|
||||
task=self.task,
|
||||
website=website,
|
||||
title=title,
|
||||
content=content,
|
||||
# 检查是否已存在相同URL的文章
|
||||
existing_content = CrawledContent.objects.filter(
|
||||
url=link_info['url'],
|
||||
publish_date=publish_date,
|
||||
author=author,
|
||||
keywords_matched=','.join(matched_keywords),
|
||||
is_local_saved=False # 初始设置为False,保存到本地后会更新为True
|
||||
)
|
||||
task=self.task
|
||||
).first()
|
||||
|
||||
# 提取并下载媒体文件
|
||||
media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
|
||||
if existing_content:
|
||||
# 如果已存在,更新现有记录而不是创建新记录
|
||||
existing_content.title = title
|
||||
existing_content.content = content
|
||||
existing_content.publish_date = publish_date
|
||||
existing_content.author = author
|
||||
existing_content.keywords_matched = ','.join(matched_keywords)
|
||||
existing_content.save()
|
||||
|
||||
# 标记内容已保存
|
||||
self.mark_content_saved(crawled_content)
|
||||
# 更新媒体文件
|
||||
# 先删除旧的媒体文件
|
||||
existing_content.media_files.all().delete()
|
||||
# 然后重新下载媒体文件
|
||||
media_files = self.extract_and_download_media(article_soup, existing_content, link_info['url'])
|
||||
|
||||
self.log('info', f'更新已存在的文章: {title[:50]}...', website)
|
||||
else:
|
||||
# 保存新内容
|
||||
crawled_content = CrawledContent.objects.create(
|
||||
task=self.task,
|
||||
website=website,
|
||||
title=title,
|
||||
content=content,
|
||||
url=link_info['url'],
|
||||
publish_date=publish_date,
|
||||
author=author,
|
||||
keywords_matched=','.join(matched_keywords),
|
||||
is_local_saved=False # 初始设置为False,保存到本地后会更新为True
|
||||
)
|
||||
|
||||
# 提取并下载媒体文件
|
||||
media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
|
||||
|
||||
# 标记内容已保存
|
||||
self.mark_content_saved(crawled_content)
|
||||
|
||||
self.log('info', f'保存新文章: {title[:50]}...', website)
|
||||
|
||||
crawled_count += 1
|
||||
self.log('info', f'保存文章: {title[:50]}...', website)
|
||||
|
||||
# 请求间隔
|
||||
time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])
|
||||
|
||||
@@ -151,61 +151,80 @@
|
||||
|
||||
<!-- 按网站分类显示内容 -->
|
||||
<div class="col-md-8">
|
||||
{% for website_name, contents in stats.contents_by_website.items %}
|
||||
<div class="card mb-4">
|
||||
<div class="card-header">
|
||||
<h5 class="card-title mb-0">
|
||||
<i class="bi bi-globe"></i> {{ website_name }}
|
||||
<span class="badge bg-secondary">{{ contents|length }}</span>
|
||||
</h5>
|
||||
</div>
|
||||
<form id="download-form" method="post" action="{% url 'download_selected_contents' %}">
|
||||
{% csrf_token %}
|
||||
<!-- 批量操作按钮 -->
|
||||
<div class="card mb-3">
|
||||
<div class="card-body">
|
||||
<div class="list-group list-group-flush">
|
||||
{% for content in contents %}
|
||||
<div class="list-group-item">
|
||||
<div class="d-flex w-100 justify-content-between">
|
||||
<h6 class="mb-1">
|
||||
{% if content.is_local_saved %}
|
||||
<a href="{% url 'preview_crawled_content' content.id %}" target="_blank" class="text-decoration-none">
|
||||
{{ content.title|truncatechars:60 }}
|
||||
<button type="submit" class="btn btn-primary" id="download-selected" disabled>
|
||||
<i class="bi bi-download"></i> 批量下载选中文章
|
||||
</button>
|
||||
<button type="button" class="btn btn-outline-secondary" id="select-all">
|
||||
<i class="bi bi-check-all"></i> 全选
|
||||
</button>
|
||||
<button type="button" class="btn btn-outline-secondary" id="deselect-all">
|
||||
<i class="bi bi-x-circle"></i> 取消全选
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% for website_name, contents in stats.contents_by_website.items %}
|
||||
<div class="card mb-4">
|
||||
<div class="card-header">
|
||||
<h5 class="card-title mb-0">
|
||||
<i class="bi bi-globe"></i> {{ website_name }}
|
||||
<span class="badge bg-secondary">{{ contents|length }}</span>
|
||||
</h5>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<div class="list-group list-group-flush">
|
||||
{% for content in contents %}
|
||||
<div class="list-group-item">
|
||||
<div class="d-flex w-100 justify-content-between">
|
||||
<h6 class="mb-1">
|
||||
<input type="checkbox" name="selected_contents" value="{{ content.id }}" class="me-2 article-checkbox">
|
||||
{% if content.is_local_saved %}
|
||||
<a href="{% url 'preview_crawled_content' content.id %}" target="_blank" class="text-decoration-none">
|
||||
{{ content.title|truncatechars:60 }}
|
||||
</a>
|
||||
{% else %}
|
||||
<a href="{{ content.url }}" target="_blank" class="text-decoration-none">
|
||||
{{ content.title|truncatechars:60 }}
|
||||
</a>
|
||||
{% endif %}
|
||||
</h6>
|
||||
<small class="text-muted">{{ content.created_at|date:"m-d H:i" }}</small>
|
||||
</div>
|
||||
<p class="mb-1 content-preview">{{ content.content|truncatechars:100 }}</p>
|
||||
<div class="d-flex justify-content-between align-items-center">
|
||||
<small class="text-muted">
|
||||
<i class="bi bi-geo-alt"></i> {{ content.website.region }}
|
||||
{% if content.media_files.count > 0 %}
|
||||
| <i class="bi bi-image"></i> {{ content.media_files.count }} 个媒体文件
|
||||
{% endif %}
|
||||
</small>
|
||||
<div>
|
||||
<a href="{% url 'download_crawled_content' content.id %}" class="btn btn-sm btn-outline-primary" title="下载">
|
||||
<i class="bi bi-download"></i>
|
||||
</a>
|
||||
{% else %}
|
||||
<a href="{{ content.url }}" target="_blank" class="text-decoration-none">
|
||||
{{ content.title|truncatechars:60 }}
|
||||
</a>
|
||||
{% endif %}
|
||||
</h6>
|
||||
<small class="text-muted">{{ content.created_at|date:"m-d H:i" }}</small>
|
||||
</div>
|
||||
<p class="mb-1 content-preview">{{ content.content|truncatechars:100 }}</p>
|
||||
<div class="d-flex justify-content-between align-items-center">
|
||||
<small class="text-muted">
|
||||
<i class="bi bi-geo-alt"></i> {{ content.website.region }}
|
||||
{% if content.media_files.count > 0 %}
|
||||
| <i class="bi bi-image"></i> {{ content.media_files.count }} 个媒体文件
|
||||
{% endif %}
|
||||
</small>
|
||||
<div>
|
||||
<a href="{% url 'download_crawled_content' content.id %}" class="btn btn-sm btn-outline-primary" title="下载">
|
||||
<i class="bi bi-download"></i>
|
||||
</a>
|
||||
{% for keyword in content.keywords_matched|split:"," %}
|
||||
<span class="keyword-badge">{{ keyword|strip }}</span>
|
||||
{% endfor %}
|
||||
{% for keyword in content.keywords_matched|split:"," %}
|
||||
<span class="keyword-badge">{{ keyword|strip }}</span>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% empty %}
|
||||
<div class="card">
|
||||
<div class="card-body text-center">
|
||||
<p class="text-muted py-3">暂无爬取内容</p>
|
||||
{% empty %}
|
||||
<div class="card">
|
||||
<div class="card-body text-center">
|
||||
<p class="text-muted py-3">暂无爬取内容</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</form>
|
||||
|
||||
<!-- 分页信息 -->
|
||||
{% if stats.page_obj.has_other_pages %}
|
||||
@@ -321,3 +340,45 @@
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block extra_js %}
|
||||
<script>
|
||||
// Sync the bulk-download button with the current checkbox selection:
// enabled and labelled with the selected count when at least one article is
// ticked, disabled with the default caption otherwise.
function updateSelectionStatus() {
    const button = document.getElementById('download-selected');
    const checkedTotal = document.querySelectorAll('.article-checkbox:checked').length;
    const hasSelection = checkedTotal > 0;

    button.disabled = !hasSelection;
    button.innerHTML = hasSelection
        ? `<i class="bi bi-download"></i> 批量下载 (${checkedTotal})`
        : '<i class="bi bi-download"></i> 批量下载选中文章';
}
|
||||
|
||||
// "Select all": tick every article checkbox, then refresh the button state.
document.getElementById('select-all').addEventListener('click', () => {
    for (const box of document.querySelectorAll('.article-checkbox')) {
        box.checked = true;
    }
    updateSelectionStatus();
});

// "Deselect all": untick every article checkbox, then refresh the button state.
document.getElementById('deselect-all').addEventListener('click', () => {
    for (const box of document.querySelectorAll('.article-checkbox')) {
        box.checked = false;
    }
    updateSelectionStatus();
});

// Keep the button in sync whenever a single checkbox is toggled by hand.
for (const box of document.querySelectorAll('.article-checkbox')) {
    box.addEventListener('change', updateSelectionStatus);
}

// Establish the initial button state on page load.
updateSelectionStatus();
|
||||
</script>
|
||||
{% endblock %}
|
||||
@@ -6,4 +6,5 @@ urlpatterns = [
|
||||
path('search/', views.search_page, name='search'),
|
||||
path('crawled-content/<int:content_id>/preview/', views.preview_crawled_content, name='preview_crawled_content'),
|
||||
path('crawled-content/<int:content_id>/download/', views.download_crawled_content, name='download_crawled_content'),
|
||||
path('crawled-content/download-selected/', views.download_selected_contents, name='download_selected_contents'),
|
||||
]
|
||||
@@ -355,3 +355,80 @@ def download_crawled_content(request, content_id):
|
||||
response['Content-Disposition'] = f'attachment; filename="{filename}"'
|
||||
|
||||
return response
|
||||
|
||||
|
||||
def _safe_archive_name(raw, limit, fallback):
    """Return *raw* cut to *limit* characters and reduced to zip-safe characters.

    Only alphanumerics (Unicode-aware), spaces, hyphens and underscores are
    kept, so path separators and ``..`` segments can never reach an archive
    path. *fallback* is returned when nothing usable is left.
    """
    trimmed = (raw or '')[:limit].strip()
    cleaned = ''.join(c for c in trimmed if c.isalnum() or c in ' -_').strip()
    return cleaned or fallback


def download_selected_contents(request):
    """Bundle the selected crawled articles into one zip archive and return it.

    Expects a POST whose ``selected_contents`` values are article ids. Each
    article gets its own folder in the archive holding a Word document
    (metadata plus body text) and, under ``media/``, every locally stored
    media file that can be read.

    Returns:
        An ``application/zip`` attachment named ``selected_articles.zip`` on
        success; a 400 response when no ids (or non-numeric ids) are posted,
        404 when none of the ids match an article, and 405 for any method
        other than POST.
    """
    import logging

    logger = logging.getLogger(__name__)

    if request.method != 'POST':
        return HttpResponse("无效的请求方法", status=405)

    # Ids of the articles ticked in the form.
    raw_ids = request.POST.getlist('selected_contents')
    if not raw_ids:
        return HttpResponse("请至少选择一篇文章", status=400)

    # Reject malformed ids up front; passing them to the ORM would raise and
    # surface as a 500 instead of a client error.
    try:
        selected_ids = [int(raw_id) for raw_id in raw_ids]
    except ValueError:
        return HttpResponse("无效的文章ID", status=400)

    contents = CrawledContent.objects.filter(id__in=selected_ids)
    if not contents.exists():
        return HttpResponse("未找到选中的文章", status=404)

    # The archive is assembled entirely in memory.
    zip_buffer = BytesIO()
    used_folders = set()

    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        for content in contents:
            # One folder per article, named after the sanitized title.
            folder_name = _safe_archive_name(content.title, 30, f"article_{content.id}")
            # Titles sharing a prefix would otherwise produce duplicate zip
            # entries; compare case-insensitively because extraction targets
            # may be case-insensitive filesystems.
            if folder_name.casefold() in used_folders:
                folder_name = f"{folder_name}_{content.id}"
            used_folders.add(folder_name.casefold())

            # The document name is sanitized too: a raw title may contain
            # path separators that would alter the archive layout.
            doc_name = _safe_archive_name(content.title, 50, f"article_{content.id}")

            # Build the Word document.
            doc = Document()
            doc.add_heading(content.title, 0)

            # Metadata section.
            doc.add_paragraph(f'来源网站: {content.website.name} ({content.website.region})')
            doc.add_paragraph(f'原始链接: {content.url}')
            doc.add_paragraph(f'发布时间: {content.publish_date or "未知"}')
            doc.add_paragraph(f'作者: {content.author or "未知"}')
            doc.add_paragraph(f'匹配关键字: {content.keywords_matched}')
            doc.add_paragraph(f'爬取时间: {content.created_at}')

            # Body text, one paragraph per blank-line-separated chunk.
            doc.add_heading('正文', level=1)
            for paragraph in (content.content or '').split('\n\n'):
                text = paragraph.strip()
                if text:
                    doc.add_paragraph(text)

            # Serialize the document in memory and add it to the archive.
            doc_buffer = BytesIO()
            doc.save(doc_buffer)
            zip_file.writestr(f"{folder_name}/{doc_name}.docx", doc_buffer.getvalue())

            # Add the article's locally stored media files.
            for media_file in content.media_files.all():
                try:
                    local_file = media_file.local_file
                    if not local_file or not default_storage.exists(local_file.name):
                        continue
                    # Context manager so the storage handle is always closed.
                    with default_storage.open(local_file.name) as stored:
                        file_content = stored.read()
                    zip_file.writestr(
                        f"{folder_name}/media/{os.path.basename(local_file.name)}",
                        file_content,
                    )
                except Exception:
                    # Best effort: an unreadable media file must not abort
                    # the whole download, but leave a trace for diagnosis.
                    logger.warning(
                        "Skipping unreadable media file %s of content %s",
                        getattr(media_file, 'pk', None),
                        content.id,
                        exc_info=True,
                    )

    response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
    response['Content-Disposition'] = 'attachment; filename="selected_articles.zip"'
    return response
|
||||
|
||||
Reference in New Issue
Block a user