From f15b730dcabd4139d020243a12caddc052378479 Mon Sep 17 00:00:00 2001
From: yuangyaa
Date: Tue, 23 Sep 2025 15:01:36 +0800
Subject: [PATCH] Support download

---
 crawler/crawler_engine.py                |  58 ++++++---
 crawler/templates/crawler/dashboard.html | 155 ++++++++++++++++-------
 crawler/urls.py                          |   1 +
 crawler/views.py                         |  77 +++++++++++
 4 files changed, 227 insertions(+), 64 deletions(-)

diff --git a/crawler/crawler_engine.py b/crawler/crawler_engine.py
index 4763ad8..29433fd 100644
--- a/crawler/crawler_engine.py
+++ b/crawler/crawler_engine.py
@@ -461,27 +461,51 @@ class WebsiteCrawler:
                 publish_date = self.extract_publish_date(article_soup)
                 author = self.extract_author(article_soup)
 
-                # Save the content
-                crawled_content = CrawledContent.objects.create(
-                    task=self.task,
-                    website=website,
-                    title=title,
-                    content=content,
+                # Check whether an article with the same URL already exists
+                existing_content = CrawledContent.objects.filter(
                     url=link_info['url'],
-                    publish_date=publish_date,
-                    author=author,
-                    keywords_matched=','.join(matched_keywords),
-                    is_local_saved=False  # Initially False; updated to True once saved locally
-                )
+                    task=self.task
+                ).first()
 
-                # Extract and download the media files
-                media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
-
-                # Mark the content as saved
-                self.mark_content_saved(crawled_content)
+                if existing_content:
+                    # Already exists: update the existing record instead of creating a new one
+                    existing_content.title = title
+                    existing_content.content = content
+                    existing_content.publish_date = publish_date
+                    existing_content.author = author
+                    existing_content.keywords_matched = ','.join(matched_keywords)
+                    existing_content.save()
+
+                    # Update the media files
+                    # Delete the old media files first
+                    existing_content.media_files.all().delete()
+                    # Then re-download the media files
+                    media_files = self.extract_and_download_media(article_soup, existing_content, link_info['url'])
+
+                    self.log('info', f'更新已存在的文章: {title[:50]}...', website)
+                else:
+                    # Save the new content
+                    crawled_content = CrawledContent.objects.create(
+                        task=self.task,
+                        website=website,
+                        title=title,
+                        content=content,
+                        url=link_info['url'],
+                        publish_date=publish_date,
+                        author=author,
+                        keywords_matched=','.join(matched_keywords),
+                        is_local_saved=False  # Initially False; updated to True once saved locally
+                    )
+
+                    # Extract and download the media files
+                    media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
+
+                    # Mark the content as saved
+                    self.mark_content_saved(crawled_content)
+
+                    self.log('info', f'保存新文章: {title[:50]}...', website)
 
                 crawled_count += 1
-                self.log('info', f'保存文章: {title[:50]}...', website)
 
                 # Delay between requests
                 time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])
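For reference, the duplicate-handling above could also be expressed with Django's update_or_create(); a minimal sketch (not part of the patch) using the same field names, which glosses over the is_local_saved flag that the patch sets only for newly created rows:

crawled_content, created = CrawledContent.objects.update_or_create(
    url=link_info['url'],
    task=self.task,
    defaults={
        'website': website,
        'title': title,
        'content': content,
        'publish_date': publish_date,
        'author': author,
        'keywords_matched': ','.join(matched_keywords),
    },
)
if not created:
    # Existing article: drop the old media files before re-downloading them
    crawled_content.media_files.all().delete()
media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
if created:
    # New article: mark it for local saving, as the patch does
    self.mark_content_saved(crawled_content)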
diff --git a/crawler/templates/crawler/dashboard.html b/crawler/templates/crawler/dashboard.html
index 5114a79..c356159 100644
--- a/crawler/templates/crawler/dashboard.html
+++ b/crawler/templates/crawler/dashboard.html
@@ -151,61 +151,80 @@
[The body of this hunk was garbled in extraction: the HTML tags were stripped, leaving only the Django template tags and text. From what survives, the hunk wraps the existing per-website listing ({% for website_name, contents in stats.contents_by_website.items %}, with the {{ website_name }} header and {{ contents|length }} count) in a form carrying {% csrf_token %}, together with control elements (apparently a submit button for the batch download) whose markup did not survive; judging by the new view below, the form posts the chosen article IDs as "selected_contents" to the download_selected_contents endpoint. Each article entry gains a selection checkbox while keeping its existing display, re-indented inside the form: the title link ({{ content.title|truncatechars:60 }}), the {{ content.created_at|date:"m-d H:i" }} timestamp, the content preview ({{ content.content|truncatechars:100 }}), the {{ content.website.region }} label, the media-file count, and the matched-keyword badges ({% for keyword in content.keywords_matched|split:"," %} {{ keyword|strip }}). The {% empty %} "暂无爬取内容" placeholder and the {% if stats.page_obj.has_other_pages %} pagination check are retained.]
@@ -320,4 +339,46 @@
+{% endblock %}
+
+{% block extra_js %}
+[script content for the new block was lost in extraction]
+{% endblock %}
\ No newline at end of file
diff --git a/crawler/urls.py b/crawler/urls.py
index 03bb32b..64ed498 100644
--- a/crawler/urls.py
+++ b/crawler/urls.py
@@ -6,4 +6,5 @@ urlpatterns = [
     path('search/', views.search_page, name='search'),
     path('crawled-content/<int:content_id>/preview/', views.preview_crawled_content, name='preview_crawled_content'),
     path('crawled-content/<int:content_id>/download/', views.download_crawled_content, name='download_crawled_content'),
+    path('crawled-content/download-selected/', views.download_selected_contents, name='download_selected_contents'),
 ]
\ No newline at end of file
diff --git a/crawler/views.py b/crawler/views.py
index 60ec1ba..7c2dd0d 100644
--- a/crawler/views.py
+++ b/crawler/views.py
@@ -355,3 +355,80 @@ def download_crawled_content(request, content_id):
     response['Content-Disposition'] = f'attachment; filename="{filename}"'
 
     return response
+
+
+def download_selected_contents(request):
+    """Download the contents of the selected articles as a single zip archive."""
+    if request.method == 'POST':
+        # Get the IDs of the selected articles
+        selected_ids = request.POST.getlist('selected_contents')
+
+        if not selected_ids:
+            # No article selected: return an error
+            return HttpResponse("请至少选择一篇文章", status=400)
+
+        # Fetch the selected articles
+        contents = CrawledContent.objects.filter(id__in=selected_ids)
+
+        if not contents.exists():
+            return HttpResponse("未找到选中的文章", status=404)
+
+        # In-memory byte stream that holds the zip file
+        zip_buffer = BytesIO()
+
+        # Build the zip file
+        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
+            for content in contents:
+                # One folder per article
+                folder_name = f"{content.title[:30].strip()}"
+                # Make sure the folder name only contains safe characters
+                folder_name = "".join(c for c in folder_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
+
+                # Create the Word document
+                doc = Document()
+                doc.add_heading(content.title, 0)
+
+                # Add the metadata
+                doc.add_paragraph(f'来源网站: {content.website.name} ({content.website.region})')
+                doc.add_paragraph(f'原始链接: {content.url}')
+                doc.add_paragraph(f'发布时间: {content.publish_date or "未知"}')
+                doc.add_paragraph(f'作者: {content.author or "未知"}')
+                doc.add_paragraph(f'匹配关键字: {content.keywords_matched}')
+                doc.add_paragraph(f'爬取时间: {content.created_at}')
+
+                # Add the body text
+                doc.add_heading('正文', level=1)
+                for paragraph in content.content.split('\n\n'):
+                    if paragraph.strip():
+                        doc.add_paragraph(paragraph.strip())
+
+                # Save the Word document into memory
+                doc_buffer = BytesIO()
+                doc.save(doc_buffer)
+                doc_buffer.seek(0)
+
+                # Add the Word document to the zip file
+                zip_file.writestr(f"{folder_name}/{content.title[:50]}.docx", doc_buffer.getvalue())
+
+                # Add the article's media files to the zip file
+                media_files = content.media_files.all()
+                for media_file in media_files:
+                    try:
+                        # Resolve the media file's local path
+                        if media_file.local_file and default_storage.exists(media_file.local_file.name):
+                            # Read the file contents
+                            file_content = default_storage.open(media_file.local_file.name).read()
+                            # Add them to the zip file
+                            zip_file.writestr(f"{folder_name}/media/{os.path.basename(media_file.local_file.name)}", file_content)
+                    except Exception as e:
+                        # If a file cannot be read, skip it and continue with the remaining files
+                        pass
+
+        # Prepare the response
+        zip_buffer.seek(0)
+        response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
+        response['Content-Disposition'] = 'attachment; filename="selected_articles.zip"'
+
+        return response
+
+    return HttpResponse("无效的请求方法", status=405)
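A quick way to exercise the new endpoint end to end is Django's test client. A minimal sketch, assuming the crawler URLconf is included without a namespace and that the posted IDs refer to CrawledContent rows created beforehand (the IDs here are illustrative only):

import io
import zipfile

from django.test import TestCase
from django.urls import reverse


class DownloadSelectedContentsTests(TestCase):
    def test_selected_articles_are_returned_as_zip(self):
        # Assumes CrawledContent rows with these IDs were created in setUp().
        response = self.client.post(
            reverse('download_selected_contents'),
            {'selected_contents': [1, 2]},
        )
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/zip')
        # Each selected article should appear as a folder containing a .docx file.
        archive = zipfile.ZipFile(io.BytesIO(response.content))
        self.assertTrue(any(name.endswith('.docx') for name in archive.namelist()))

    def test_empty_selection_is_rejected(self):
        response = self.client.post(reverse('download_selected_contents'), {})
        self.assertEqual(response.status_code, 400)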