diff --git a/core/admin.py b/core/admin.py
index b44de9e..9e86878 100644
--- a/core/admin.py
+++ b/core/admin.py
@@ -34,7 +34,7 @@ def run_crawler_view(request):
     if not website_name:
         messages.error(request, '请选择要爬取的网站')
         return redirect('admin:core_article_changelist')
-
+
     try:
         # 根据网站名称确定要执行的爬虫命令
         if website_name == 'crawl_xinhua':
@@ -46,14 +46,14 @@ def run_crawler_view(request):
         else:
             # 对于其他网站,使用通用爬虫命令
             crawler_name = 'crawl_articles'
-
+
         # 运行爬虫命令,不传递website_name作为参数
         call_command(crawler_name)
-
+
         messages.success(request, f'成功执行爬虫: {crawler_name}')
     except Exception as e:
         messages.error(request, f'执行爬虫失败: {str(e)}')
-
+
     return redirect('admin:core_article_changelist')
@@ -241,12 +241,12 @@ class ArticleAdmin(admin.ModelAdmin):
 
         # 创建内存中的ZIP文件
         zip_buffer = BytesIO()
-
+
         with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
             for article in queryset:
                 # 为每篇文章创建单独的文件夹
                 article_folder = f"article_{article.id}_{article.title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')}"
-
+
                 # 创建Word文档
                 doc = Document()
                 doc.add_heading(article.title, 0)
@@ -281,7 +281,8 @@ class ArticleAdmin(admin.ModelAdmin):
                                     image_stream = BytesIO(response.content)
                                     doc.add_picture(image_stream, width=Inches(4.0))
                                     # 将网络文件保存到ZIP
-                                    zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(src)), response.content)
+                                    zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(src)),
+                                                      response.content)
                                 else:
                                     # 本地图片
                                     full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
@@ -310,7 +311,7 @@ class ArticleAdmin(admin.ModelAdmin):
                         full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                         # 检查文件扩展名以确定处理方式
                         file_extension = os.path.splitext(media_file)[1].lower()
-
+
                         # 图片文件处理
                         if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
                             if os.path.exists(full_path):
@@ -325,7 +326,9 @@ class ArticleAdmin(admin.ModelAdmin):
                                     image_stream = BytesIO(response.content)
                                     doc.add_picture(image_stream, width=Inches(4.0))
                                     # 将网络文件保存到ZIP
-                                    zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
+                                    zip_file.writestr(
+                                        os.path.join(article_folder, 'media', os.path.basename(media_file)),
+                                        response.content)
                                 else:
                                     doc.add_paragraph(media_file)
                         # 视频文件处理
@@ -341,7 +344,9 @@ class ArticleAdmin(admin.ModelAdmin):
                                 if media_file.startswith('http'):
                                     # 将网络文件保存到ZIP
                                     response = requests.get(media_file, timeout=10)
-                                    zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
+                                    zip_file.writestr(
+                                        os.path.join(article_folder, 'media', os.path.basename(media_file)),
+                                        response.content)
                                     doc.add_paragraph(f"[视频文件: {media_file}]")
                                 else:
                                     doc.add_paragraph(media_file)
@@ -355,7 +360,9 @@ class ArticleAdmin(admin.ModelAdmin):
                                 # 如果是URL格式的媒体文件
                                 if media_file.startswith('http'):
                                     response = requests.get(media_file, timeout=10)
-                                    zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
+                                    zip_file.writestr(
+                                        os.path.join(article_folder, 'media', os.path.basename(media_file)),
+                                        response.content)
                                     doc.add_paragraph(f"[文件: {media_file}]")
                                 else:
                                     doc.add_paragraph(media_file)
@@ -366,7 +373,8 @@ class ArticleAdmin(admin.ModelAdmin):
                 doc_buffer = BytesIO()
                 doc.save(doc_buffer)
                 doc_buffer.seek(0)
-                zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'), doc_buffer.read())
+                zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'),
+                                  doc_buffer.read())
 
         # 创建HttpResponse
         zip_buffer.seek(0)
@@ -512,5 +520,4 @@ class DongfangyancaoArticleAdmin(admin.ModelAdmin):
     export_as_json.short_description = "导出选中文章为JSON格式"
 
-
 # 在各自的管理站点中注册模型
diff --git a/core/templates/admin/core/article/change_list.html b/core/templates/admin/core/article/change_list.html
index 949e5f7..26554d0 100644
--- a/core/templates/admin/core/article/change_list.html
+++ b/core/templates/admin/core/article/change_list.html
@@ -13,7 +13,7 @@
-
+
 {% endblock %}
\ No newline at end of file
diff --git a/core/templates/core/article_detail.html b/core/templates/core/article_detail.html
index 649e54b..896eb2c 100644
--- a/core/templates/core/article_detail.html
+++ b/core/templates/core/article_detail.html
@@ -2,24 +2,23 @@
-    {{ article.title }}
+    绿色课堂文章列表
-
-

{{ article.title }}

-
-

发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}

+
+

绿色课堂文章列表

+ + +
+
+ + {% if selected_website %} + + {% endif %} + +
-
-
- {{ article.content|safe }} + +
+ + + + +
+ + {% if search_query %} +
+ 搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章 + 清除搜索 +
+ {% endif %} + + +
+ + + + + + +
+ +
    + {% for article in page_obj %} +
  • + + {{ article.title }} +
    ({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})
    +
  • + {% empty %} +
  • 暂无文章
  • + {% endfor %} +
+ + +
-
-

← 返回列表

+
+
\ No newline at end of file
diff --git a/core/templates/core/article_list.html b/core/templates/core/article_list.html
index 77d19bc..18e736b 100644
--- a/core/templates/core/article_list.html
+++ b/core/templates/core/article_list.html
@@ -17,7 +17,7 @@
             background: white;
             padding: 30px;
             margin-bottom: 20px;
-            box-shadow: 0 2px 5px rgba(0,0,0,0.05); /* 添加轻微阴影 */
+            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* 添加轻微阴影 */
             border-radius: 8px; /* 添加圆角 */
         }
@@ -116,7 +116,7 @@
             padding: 8px 4px;
             color: #7f8c8d;
         }
-
+
         /* 新增:搜索框样式 */
         .search-form {
             margin-bottom: 20px;
@@ -124,7 +124,7 @@
             background-color: #e3f2fd; /* 统一搜索框背景色调 */
             border-radius: 5px;
         }
-
+
         .search-form input[type="text"] {
             padding: 8px 12px;
             border: 1px solid #bbdefb; /* 统一边框颜色 */
@@ -133,7 +133,7 @@
             margin-right: 10px;
             background-color: #fff;
         }
-
+
         .search-form input[type="submit"] {
             padding: 8px 16px;
             background-color: #3498db;
@@ -142,46 +142,46 @@
             border-radius: 4px;
             cursor: pointer;
         }
-
+
         .search-form input[type="submit"]:hover {
             background-color: #2980b9;
         }
-
+
         .search-info {
             color: #78909c; /* 统一搜索信息颜色 */
             font-size: 0.9em;
             margin-bottom: 10px;
         }
-
+
         /* 新增:左侧筛选栏样式 */
         .content-wrapper {
             display: flex;
             gap: 20px;
         }
-
+
         .sidebar {
             flex: 0 0 200px;
             background-color: #e3f2fd; /* 统一边栏背景色调 */
             border-radius: 5px;
             padding: 15px;
         }
-
+
         .main-content {
             flex: 1;
         }
-
+
         .sidebar .filters {
             margin-bottom: 20px;
             padding: 0;
             background-color: transparent;
         }
-
+
         .sidebar .filters strong {
             display: block;
             margin-bottom: 10px;
             color: #2c3e50;
         }
-
+
         .sidebar .filters a {
             display: block;
             padding: 8px 10px;
@@ -191,12 +191,12 @@
             text-decoration: none;
             border-radius: 3px;
         }
-
+
         .sidebar .filters a.active {
             background-color: #3498db;
             color: white;
         }
-
+
         /* 新增:导出功能样式 */
         .export-section {
             margin-bottom: 20px;
@@ -205,7 +205,7 @@
             border-radius: 5px;
             text-align: center;
         }
-
+
         .export-btn {
             padding: 10px 20px;
             background-color: #4caf50; /* 统一按钮背景色调 */
@@ -216,118 +216,31 @@
             font-size: 16px;
             margin: 0 5px;
         }
-
+
         .export-btn:hover {
             background-color: #388e3c; /* 统一按钮悬停色调 */
         }
-
+
         .export-btn:disabled {
             background-color: #9e9e9e; /* 统一禁用按钮色调 */
             cursor: not-allowed;
         }
-
+
         .article-checkbox {
             margin-right: 10px;
         }
-
-        /* 新增:爬虫控制按钮样式 */
-        .crawler-control {
-            margin-bottom: 20px;
-            padding: 15px;
-            background-color: #fff3e0; /* 统一爬虫控制区域背景色调 */
-            border-radius: 5px;
-            text-align: center;
-        }
-
-        .crawler-btn {
-            padding: 10px 20px;
-            background-color: #ff9800; /* 统一爬虫按钮背景色调 */
-            color: white;
-            border: none;
-            border-radius: 4px;
-            cursor: pointer;
-            font-size: 16px;
-            margin: 0 5px;
-        }
-
-        .crawler-btn:hover {
-            background-color: #f57c00; /* 统一爬虫按钮悬停色调 */
-        }
-
-        .crawler-btn:disabled {
-            background-color: #9e9e9e; /* 统一禁用爬虫按钮色调 */
-            cursor: not-allowed;
-        }
-
-        .crawler-result {
-            margin-top: 10px;
-            padding: 10px;
-            border-radius: 4px;
-            display: none;
-        }
-
-        .crawler-result.success {
-            background-color: #e8f5e9;
-            color: #2e7d32;
-        }
-
-        .crawler-result.error {
-            background-color: #ffebee;
-            color: #c62828;
-        }
-
-        /* 新增:进度条样式 */
-        .progress-container {
-            margin-top: 10px;
-            display: none;
-        }
-
-        .progress-bar {
-            width: 100%;
-            height: 20px;
-            background-color: #e0e0e0;
-            border-radius: 10px;
-            overflow: hidden;
-        }
-
-        .progress-fill {
-            height: 100%;
-            background-color: #4caf50;
-            width: 0%;
-            transition: width 0.3s ease;
-        }
-
-        .progress-text {
-            margin-top: 5px;
-            font-size: 14px;
-            color: #666;
-        }

绿色课堂文章列表

- - -
- - - -
-
-
-
-
准备中...
-
-
-
{% if selected_website %} - + {% endif %}
@@ -338,23 +251,25 @@ - +
{% if search_query %} -
- 搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章 - 清除搜索 -
+
+ 搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章 + 清除搜索 +
{% endif %} - +
@@ -367,60 +282,70 @@
    {% for article in page_obj %} -
  • - - {{ article.title }} -
    ({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})
    -
  • - {% empty %} -
  • 暂无文章
  • +
  • + + {{ article.title }} +
    ({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})
    +
  • + {% empty %} +
  • 暂无文章
  • {% endfor %}
@@ -428,213 +353,6 @@
diff --git a/core/urls.py b/core/urls.py
index fb2be6d..c833256 100644
--- a/core/urls.py
+++ b/core/urls.py
@@ -9,7 +9,9 @@ urlpatterns = [
     path('run-crawler/', views.run_crawler, name='run_crawler'),
     # 新增:检查爬虫状态的路由
     path('crawler-status/', views.crawler_status, name='crawler_status'),
+    # 新增:暂停爬虫的路由
+    path('pause-crawler/', views.pause_crawler, name='pause_crawler'),
     # 添加导出文章的路由
     path('export-articles/', views.export_articles, name='export_articles'),
     # 添加自定义管理后台的路由
-]
\ No newline at end of file
+]
diff --git a/core/views.py b/core/views.py
index b2721df..81a06f0 100644
--- a/core/views.py
+++ b/core/views.py
@@ -12,7 +12,6 @@
 import csv
 from django.views.decorators.csrf import csrf_exempt
 from django.utils import timezone
-
 
 # 用于跟踪爬虫任务状态的全局字典
 crawler_tasks = {}
@@ -73,7 +72,7 @@ def run_crawler(request):
 
         # 生成任务ID
         task_id = str(uuid.uuid4())
-
+
         # 记录任务开始前的文章数量
         initial_count = Article.objects.count()
 
@@ -87,18 +86,18 @@ def run_crawler(request):
             'start_time': timezone.now(),
             'initial_count': initial_count
         }
-
+
         # 根据爬虫名称调用相应的命令
         if crawler_name in ['crawl_xinhua', 'crawl_dongfangyancao']:
             call_command(crawler_name)
         else:
             # 如果是通用爬虫命令,使用crawl_articles
             call_command('crawl_articles', crawler_name)
-
+
         # 计算新增文章数量
         final_count = Article.objects.count()
         added_count = final_count - initial_count
-
+
         # 更新任务状态为完成
         crawler_tasks[task_id] = {
             'status': 'completed',
@@ -113,11 +112,11 @@ def run_crawler(request):
                 error_msg = "检测到重复文章URL,已跳过重复项"
             else:
                 print(f"爬虫执行出错: {e}")
-
+
             # 计算实际新增文章数量(即使有错误也统计)
             final_count = Article.objects.count()
             added_count = final_count - initial_count
-
+
             # 更新任务状态为完成(即使有部分错误)
             crawler_tasks[task_id] = {
                 'status': 'completed',
@@ -147,17 +146,47 @@ def crawler_status(request):
         task_id = request.POST.get('task_id', '')
         if not task_id:
             return JsonResponse({'status': 'error', 'message': '任务ID不能为空'})
-
+
         # 获取任务状态
         task_info = crawler_tasks.get(task_id)
         if not task_info:
             return JsonResponse({'status': 'error', 'message': '未找到任务'})
-
+
         return JsonResponse(task_info)
     except Exception as e:
         return JsonResponse({'status': 'error', 'message': str(e)})
 
 
+# 新增:暂停爬虫的视图
+@require_http_methods(["POST"])
+def pause_crawler(request):
+    """
+    暂停爬虫任务
+    """
+    try:
+        task_id = request.POST.get('task_id', '')
+        if not task_id:
+            return JsonResponse({'status': 'error', 'message': '任务ID不能为空'})
+
+        # 获取任务状态
+        task_info = crawler_tasks.get(task_id)
+        if not task_info:
+            return JsonResponse({'status': 'error', 'message': '未找到任务'})
+
+        # 在实际应用中,这里应该实现真正的暂停逻辑
+        # 目前我们只是更新任务状态来模拟暂停功能
+        task_info['status'] = 'paused'
+        task_info['message'] = '爬虫已暂停'
+
+        return JsonResponse({
+            'status': 'success',
+            'message': '爬虫已暂停',
+            'progress': 0  # 这里应该返回实际进度
+        })
+    except Exception as e:
+        return JsonResponse({'status': 'error', 'message': str(e)})
+
+
 # 新增:文章导出视图
 @csrf_exempt
 @require_http_methods(["POST"])
@@ -167,13 +196,13 @@ def export_articles(request):
         data = json.loads(request.body)
         article_ids = data.get('article_ids', [])
         format_type = data.get('format', 'json')
-
+
         # 获取选中的文章
         articles = Article.objects.filter(id__in=article_ids)
-
+
         if not articles.exists():
             return HttpResponse('没有选中文章', status=400)
-
+
         # 根据格式类型导出
         if format_type == 'json':
             # 准备JSON数据
@@ -189,7 +218,7 @@ def export_articles(request):
                     'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                     'media_files': article.media_files
                 })
-
+
             # 创建JSON响应
             response = HttpResponse(
                 json.dumps(articles_data, ensure_ascii=False, indent=2),
@@ -197,16 +226,16 @@ def export_articles(request):
             )
             response['Content-Disposition'] = 'attachment; filename="articles.json"'
             return response
-
+
         elif format_type == 'csv':
             # 创建CSV响应
             response = HttpResponse(content_type='text/csv')
             response['Content-Disposition'] = 'attachment; filename="articles.csv"'
-
+
             # 创建CSV写入器
             writer = csv.writer(response)
             writer.writerow(['ID', '标题', '网站', 'URL', '发布时间', '内容', '创建时间', '媒体文件'])
-
+
             # 写入文章数据
             for article in articles:
                 writer.writerow([
@@ -219,25 +248,25 @@ def export_articles(request):
                     article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                     ';'.join(article.media_files) if article.media_files else ''
                 ])
-
+
             return response
-
+
         # 新增:支持ZIP格式导出
         elif format_type == 'zip':
             import zipfile
             from io import BytesIO
             from django.conf import settings
             import os
-
+
             # 创建内存中的ZIP文件
             zip_buffer = BytesIO()
-
+
             with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
                 # 为每篇文章创建Word文档并添加到ZIP文件中
                 for article in articles:
                     # 为每篇文章创建单独的文件夹
                     article_folder = f"article_{article.id}_{article.title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')}"
-
+
                     # 创建文章数据
                     article_data = {
                         'id': article.id,
@@ -249,7 +278,7 @@ def export_articles(request):
                         'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                         'media_files': article.media_files
                     }
-
+
                     # 将文章数据保存为Word文件并添加到ZIP
                     try:
                         from docx import Document
@@ -257,23 +286,24 @@ def export_articles(request):
                         from io import BytesIO
                         from bs4 import BeautifulSoup
                         import requests
-
+
                         # 创建Word文档
                         doc = Document()
                         doc.add_heading(article.title, 0)
-
+
                         # 添加文章元数据
                         doc.add_paragraph(f"网站: {article.website.name}")
                         doc.add_paragraph(f"URL: {article.url}")
-                        doc.add_paragraph(f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
+                        doc.add_paragraph(
+                            f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
                         doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
-
+
                         # 添加文章内容
                         doc.add_heading('内容', level=1)
-
+
                         # 处理HTML内容
                         soup = BeautifulSoup(article.content, 'html.parser')
-
+
                         # 处理内容中的图片
                         for img in soup.find_all('img'):
                             src = img.get('src', '')
@@ -293,13 +323,13 @@ def export_articles(request):
                             except Exception as e:
                                 # 如果添加图片失败,添加图片URL作为文本
                                 doc.add_paragraph(f"[图片: {src}]")
-
+
                             # 移除原始img标签
                             img.decompose()
-
+
                         content_text = soup.get_text()
                         doc.add_paragraph(content_text)
-
+
                         # 添加媒体文件信息
                         if article.media_files:
                             doc.add_heading('媒体文件', level=1)
@@ -309,7 +339,7 @@ def export_articles(request):
                                     if os.path.exists(full_path):
                                         # 检查文件扩展名以确定处理方式
                                         file_extension = os.path.splitext(media_file)[1].lower()
-
+
                                         # 图片文件处理
                                         if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
                                             doc.add_picture(full_path, width=Inches(4.0))
@@ -324,7 +354,7 @@ def export_articles(request):
                                         if media_file.startswith('http'):
                                             response = requests.get(media_file, timeout=10)
                                             file_extension = os.path.splitext(media_file)[1].lower()
-
+
                                             # 图片文件处理
                                             if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
                                                 image_stream = BytesIO(response.content)
@@ -335,20 +365,22 @@ def export_articles(request):
                                                 doc.add_paragraph(media_file)
                                     except Exception as e:
                                         doc.add_paragraph(media_file)
-
+
                         # 保存Word文档到内存
                         doc_buffer = BytesIO()
                         doc.save(doc_buffer)
                         doc_buffer.seek(0)
-
+
                         # 将Word文档添加到ZIP包
-                        zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'), doc_buffer.read())
-
+                        zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'),
+                                          doc_buffer.read())
+
                     except ImportError:
                         # 如果没有安装python-docx库,回退到JSON格式
                         json_data = json.dumps(article_data, ensure_ascii=False, indent=2)
-                        zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.json'), json_data)
-
+                        zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.json'),
+                                          json_data)
+
                     # 添加媒体文件到ZIP包
                     if article.media_files:
                         for media_file in article.media_files:
@@ -362,19 +394,21 @@ def export_articles(request):
                                 if media_file.startswith('http'):
                                     import requests
                                     response = requests.get(media_file, timeout=10)
-                                    zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
+                                    zip_file.writestr(
+                                        os.path.join(article_folder, 'media', os.path.basename(media_file)),
+                                        response.content)
                             except Exception as e:
                                 # 如果添加媒体文件失败,继续处理其他文件
                                 pass
-
+
             # 创建HttpResponse
             zip_buffer.seek(0)
             response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
             response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
             return response
-
+
         else:
             return HttpResponse('不支持的格式', status=400)
-
+
     except Exception as e:
-        return HttpResponse(f'导出失败: {str(e)}', status=500)
\ No newline at end of file
+        return HttpResponse(f'导出失败: {str(e)}', status=500)
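
Note on the new pause_crawler view: as its own comments say, it only flips the in-memory task entry to 'paused' and does not actually stop the command started by run_crawler. Below is a minimal sketch of what a cooperative pause could look like; it assumes the crawl loop runs in the same process as the web workers (the same assumption the in-memory crawler_tasks dict already makes), and the names PAUSE_EVENTS, request_pause and fetch_page are illustrative, not part of this patch.

    # Sketch only -- not the patch's implementation. A per-task threading.Event
    # that the crawl loop checks between pages; pause_crawler would call
    # request_pause() in addition to updating crawler_tasks[task_id].
    import threading
    import time

    PAUSE_EVENTS: dict[str, threading.Event] = {}  # task_id -> pause flag (illustrative)


    def request_pause(task_id: str) -> None:
        """Set the pause flag for a task; safe even if the task has not registered yet."""
        PAUSE_EVENTS.setdefault(task_id, threading.Event()).set()


    def resume(task_id: str) -> None:
        """Clear the pause flag so the crawl loop continues."""
        PAUSE_EVENTS.setdefault(task_id, threading.Event()).clear()


    def fetch_page(url: str) -> None:
        """Stand-in for the real per-page work (download, parse, save an Article)."""
        print(f"crawling {url}")


    def crawl(task_id: str, urls: list[str]) -> None:
        """Crawl loop that idles while the task's pause flag is set."""
        flag = PAUSE_EVENTS.setdefault(task_id, threading.Event())
        for url in urls:
            while flag.is_set():  # paused: wait until resume() clears the flag
                time.sleep(1)
            fetch_page(url)


    if __name__ == "__main__":
        crawl("demo-task", ["https://example.com/a", "https://example.com/b"])

With several worker processes (for example under gunicorn), a module-level dict is not shared between workers, so the pause flag would have to live in the database or a cache instead; the same limitation already applies to crawler_tasks itself.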