From f15b730dcabd4139d020243a12caddc052378479 Mon Sep 17 00:00:00 2001
From: yuangyaa
Date: Tue, 23 Sep 2025 15:01:36 +0800
Subject: [PATCH] Support download

---
 crawler/crawler_engine.py                |  58 ++++++---
 crawler/templates/crawler/dashboard.html | 155 ++++++++++++++++-------
 crawler/urls.py                          |   1 +
 crawler/views.py                         |  77 +++++++++++
 4 files changed, 227 insertions(+), 64 deletions(-)

diff --git a/crawler/crawler_engine.py b/crawler/crawler_engine.py
index 4763ad8..29433fd 100644
--- a/crawler/crawler_engine.py
+++ b/crawler/crawler_engine.py
@@ -461,27 +461,51 @@ class WebsiteCrawler:
                 publish_date = self.extract_publish_date(article_soup)
                 author = self.extract_author(article_soup)
 
-                # Save the content
-                crawled_content = CrawledContent.objects.create(
-                    task=self.task,
-                    website=website,
-                    title=title,
-                    content=content,
+                # Check whether an article with the same URL already exists
+                existing_content = CrawledContent.objects.filter(
                     url=link_info['url'],
-                    publish_date=publish_date,
-                    author=author,
-                    keywords_matched=','.join(matched_keywords),
-                    is_local_saved=False  # Initially False; updated to True once saved locally
-                )
+                    task=self.task
+                ).first()
 
-                # Extract and download the media files
-                media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
-
-                # Mark the content as saved
-                self.mark_content_saved(crawled_content)
+                if existing_content:
+                    # Already exists: update the existing record instead of creating a new one
+                    existing_content.title = title
+                    existing_content.content = content
+                    existing_content.publish_date = publish_date
+                    existing_content.author = author
+                    existing_content.keywords_matched = ','.join(matched_keywords)
+                    existing_content.save()
+
+                    # Update the media files
+                    # Delete the old media files first
+                    existing_content.media_files.all().delete()
+                    # Then re-download the media files
+                    media_files = self.extract_and_download_media(article_soup, existing_content, link_info['url'])
+
+                    self.log('info', f'更新已存在的文章: {title[:50]}...', website)
+                else:
+                    # Save the new content
+                    crawled_content = CrawledContent.objects.create(
+                        task=self.task,
+                        website=website,
+                        title=title,
+                        content=content,
+                        url=link_info['url'],
+                        publish_date=publish_date,
+                        author=author,
+                        keywords_matched=','.join(matched_keywords),
+                        is_local_saved=False  # Initially False; updated to True once saved locally
+                    )
+
+                    # Extract and download the media files
+                    media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
+
+                    # Mark the content as saved
+                    self.mark_content_saved(crawled_content)
+
+                    self.log('info', f'保存新文章: {title[:50]}...', website)
 
                 crawled_count += 1
-                self.log('info', f'保存文章: {title[:50]}...', website)
 
                 # Delay between requests
                 time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])
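For reference, the duplicate-handling above could also be expressed with Django's update_or_create(); a minimal sketch (not part of the patch) using the same field names, which glosses over the is_local_saved flag that the patch sets only for newly created rows:

crawled_content, created = CrawledContent.objects.update_or_create(
    url=link_info['url'],
    task=self.task,
    defaults={
        'website': website,
        'title': title,
        'content': content,
        'publish_date': publish_date,
        'author': author,
        'keywords_matched': ','.join(matched_keywords),
    },
)
if not created:
    # Existing article: drop the old media files before re-downloading them
    crawled_content.media_files.all().delete()
media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
if created:
    # New article: mark it for local saving, as the patch does
    self.mark_content_saved(crawled_content)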
diff --git a/crawler/templates/crawler/dashboard.html b/crawler/templates/crawler/dashboard.html
index 5114a79..c356159 100644
--- a/crawler/templates/crawler/dashboard.html
+++ b/crawler/templates/crawler/dashboard.html
@@ -151,61 +151,80 @@
[The body of this hunk was garbled in extraction: the HTML tags were stripped, leaving only the Django template tags and text. From what survives, the hunk wraps the existing per-website listing ({% for website_name, contents in stats.contents_by_website.items %}, with the {{ website_name }} header and {{ contents|length }} count) in a form carrying {% csrf_token %}, together with control elements (apparently a submit button for the batch download) whose markup did not survive; judging by the new view below, the form posts the chosen article IDs as "selected_contents" to the download_selected_contents endpoint. Each article entry gains a selection checkbox while keeping its existing display, re-indented inside the form: the title link ({{ content.title|truncatechars:60 }}), the {{ content.created_at|date:"m-d H:i" }} timestamp, the content preview ({{ content.content|truncatechars:100 }}), the {{ content.website.region }} label, the media-file count, and the matched-keyword badges ({% for keyword in content.keywords_matched|split:"," %} {{ keyword|strip }}). The {% empty %} "暂无爬取内容" placeholder and the {% if stats.page_obj.has_other_pages %} pagination check are retained.]
@@ -320,4 +339,46 @@
+{% endblock %}
+
+{% block extra_js %}
+[script content for the new block was lost in extraction]
+{% endblock %}
\ No newline at end of file
diff --git a/crawler/urls.py b/crawler/urls.py
index 03bb32b..64ed498 100644
--- a/crawler/urls.py
+++ b/crawler/urls.py
@@ -6,4 +6,5 @@ urlpatterns = [
     path('search/', views.search_page, name='search'),
     path('crawled-content/<int:content_id>/preview/', views.preview_crawled_content, name='preview_crawled_content'),
     path('crawled-content/<int:content_id>/download/', views.download_crawled_content, name='download_crawled_content'),
+    path('crawled-content/download-selected/', views.download_selected_contents, name='download_selected_contents'),
 ]
\ No newline at end of file
diff --git a/crawler/views.py b/crawler/views.py
index 60ec1ba..7c2dd0d 100644
--- a/crawler/views.py
+++ b/crawler/views.py
@@ -355,3 +355,80 @@ def download_crawled_content(request, content_id):
     response['Content-Disposition'] = f'attachment; filename="{filename}"'
 
     return response
+
+
+def download_selected_contents(request):
+    """Download the contents of the selected articles as a single zip archive."""
+    if request.method == 'POST':
+        # Get the IDs of the selected articles
+        selected_ids = request.POST.getlist('selected_contents')
+
+        if not selected_ids:
+            # No article selected: return an error
+            return HttpResponse("请至少选择一篇文章", status=400)
+
+        # Fetch the selected articles
+        contents = CrawledContent.objects.filter(id__in=selected_ids)
+
+        if not contents.exists():
+            return HttpResponse("未找到选中的文章", status=404)
+
+        # In-memory byte stream that holds the zip file
+        zip_buffer = BytesIO()
+
+        # Build the zip file
+        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+            for content in contents:
+                # One folder per article
+                folder_name = f"{content.title[:30].strip()}"
+                # Make sure the folder name only contains safe characters
+                folder_name = "".join(c for c in folder_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
+
+                # Create the Word document
+                doc = Document()
+                doc.add_heading(content.title, 0)
+
+                # Add the metadata
+                doc.add_paragraph(f'来源网站: {content.website.name} ({content.website.region})')
+                doc.add_paragraph(f'原始链接: {content.url}')
+                doc.add_paragraph(f'发布时间: {content.publish_date or "未知"}')
+                doc.add_paragraph(f'作者: {content.author or "未知"}')
+                doc.add_paragraph(f'匹配关键字: {content.keywords_matched}')
+                doc.add_paragraph(f'爬取时间: {content.created_at}')
+
+                # Add the body text
+                doc.add_heading('正文', level=1)
+                for paragraph in content.content.split('\n\n'):
+                    if paragraph.strip():
+                        doc.add_paragraph(paragraph.strip())
+
+                # Save the Word document into memory
+                doc_buffer = BytesIO()
+                doc.save(doc_buffer)
+                doc_buffer.seek(0)
+
+                # Add the Word document to the zip file
+                zip_file.writestr(f"{folder_name}/{content.title[:50]}.docx", doc_buffer.getvalue())
+
+                # Add the article's media files to the zip file
+                media_files = content.media_files.all()
+                for media_file in media_files:
+                    try:
+                        # Resolve the media file's local path
+                        if media_file.local_file and default_storage.exists(media_file.local_file.name):
+                            # Read the file contents
+                            file_content = default_storage.open(media_file.local_file.name).read()
+                            # Add them to the zip file
+                            zip_file.writestr(f"{folder_name}/media/{os.path.basename(media_file.local_file.name)}", file_content)
+                    except Exception as e:
+                        # If a file cannot be read, skip it and continue with the remaining files
+                        pass
+
+        # Prepare the response
+        zip_buffer.seek(0)
+        response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
+        response['Content-Disposition'] = 'attachment; filename="selected_articles.zip"'
+
+        return response
+
+    return HttpResponse("无效的请求方法", status=405)
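A quick way to exercise the new endpoint end to end is Django's test client. A minimal sketch, assuming the crawler URLconf is included without a namespace and that the posted IDs refer to CrawledContent rows created beforehand (the IDs here are illustrative only):

import io
import zipfile

from django.test import TestCase
from django.urls import reverse


class DownloadSelectedContentsTests(TestCase):
    def test_selected_articles_are_returned_as_zip(self):
        # Assumes CrawledContent rows with these IDs were created in setUp().
        response = self.client.post(
            reverse('download_selected_contents'),
            {'selected_contents': [1, 2]},
        )
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/zip')
        # Each selected article should appear as a folder containing a .docx file.
        archive = zipfile.ZipFile(io.BytesIO(response.content))
        self.assertTrue(any(name.endswith('.docx') for name in archive.namelist()))

    def test_empty_selection_is_rejected(self):
        response = self.client.post(reverse('download_selected_contents'), {})
        self.assertEqual(response.status_code, 400)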