Unknown change

2025-08-13 18:40:31 +08:00
parent 5e396796ca
commit c618528a0a
6 changed files with 996 additions and 528 deletions


@@ -12,7 +12,6 @@ import csv
from django.views.decorators.csrf import csrf_exempt
from django.utils import timezone
# Global dictionary used to track crawler task status
crawler_tasks = {}
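The crawler_tasks dict above is module-level state, so it is shared by every request-handling thread in the worker process and is lost on restart. A minimal sketch, not part of this commit, of how the dict could be guarded with a lock; the helper names are hypothetical:

import threading

_tasks_lock = threading.Lock()  # hypothetical guard, not in the original code

def _set_task(task_id, **fields):
    # Update one task entry atomically so concurrent requests do not interleave writes
    with _tasks_lock:
        crawler_tasks.setdefault(task_id, {}).update(fields)

def _get_task(task_id):
    # Return a copy so callers never see a half-updated entry
    with _tasks_lock:
        info = crawler_tasks.get(task_id)
        return dict(info) if info else None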
@@ -73,7 +72,7 @@ def run_crawler(request):
# Generate a task ID
task_id = str(uuid.uuid4())
# Record the article count before the task starts
initial_count = Article.objects.count()
@@ -87,18 +86,18 @@ def run_crawler(request):
'start_time': timezone.now(),
'initial_count': initial_count
}
# Invoke the matching command based on the crawler name
if crawler_name in ['crawl_xinhua', 'crawl_dongfangyancao']:
call_command(crawler_name)
else:
# If it is a generic crawler, use the crawl_articles command
call_command('crawl_articles', crawler_name)
# Count the newly added articles
final_count = Article.objects.count()
added_count = final_count - initial_count
# Update the task status to completed
crawler_tasks[task_id] = {
'status': 'completed',
@@ -113,11 +112,11 @@ def run_crawler(request):
error_msg = "Duplicate article URLs detected; duplicates were skipped"
else:
print(f"Crawler execution error: {e}")
# Count the articles actually added (counted even if errors occurred)
final_count = Article.objects.count()
added_count = final_count - initial_count
# Update the task status to completed (even with partial errors)
crawler_tasks[task_id] = {
'status': 'completed',
@@ -147,17 +146,47 @@ def crawler_status(request):
task_id = request.POST.get('task_id', '')
if not task_id:
return JsonResponse({'status': 'error', 'message': 'Task ID cannot be empty'})
# Look up the task status
task_info = crawler_tasks.get(task_id)
if not task_info:
return JsonResponse({'status': 'error', 'message': 'Task not found'})
return JsonResponse(task_info)
except Exception as e:
return JsonResponse({'status': 'error', 'message': str(e)})
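The crawler_status view above returns the raw task entry as JSON, so a caller can poll it until the task finishes. A minimal client-side sketch using the requests library; the route /crawler/status/ and the host are assumptions, since the URL configuration is not part of this diff, and CSRF handling may be needed depending on the project settings:

import time
import requests

def wait_for_task(task_id, base_url="http://localhost:8000"):
    # Poll the status endpoint until the task reports completion or an error
    while True:
        resp = requests.post(f"{base_url}/crawler/status/", data={"task_id": task_id})
        info = resp.json()
        if info.get("status") in ("completed", "error"):
            return info
        time.sleep(2)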
# New: view for pausing the crawler
@require_http_methods(["POST"])
def pause_crawler(request):
"""
Pause a crawler task
"""
try:
task_id = request.POST.get('task_id', '')
if not task_id:
return JsonResponse({'status': 'error', 'message': 'Task ID cannot be empty'})
# Look up the task status
task_info = crawler_tasks.get(task_id)
if not task_info:
return JsonResponse({'status': 'error', 'message': 'Task not found'})
# In a real application, actual pause logic should be implemented here
# For now we only update the task status to simulate pausing
task_info['status'] = 'paused'
task_info['message'] = 'Crawler paused'
return JsonResponse({
'status': 'success',
'message': 'Crawler paused',
'progress': 0 # The actual progress should be returned here
})
except Exception as e:
return JsonResponse({'status': 'error', 'message': str(e)})
# New: article export view
@csrf_exempt
@require_http_methods(["POST"])
@@ -167,13 +196,13 @@ def export_articles(request):
data = json.loads(request.body)
article_ids = data.get('article_ids', [])
format_type = data.get('format', 'json')
# Fetch the selected articles
articles = Article.objects.filter(id__in=article_ids)
if not articles.exists():
return HttpResponse('No articles selected', status=400)
# Export according to the requested format
if format_type == 'json':
# Prepare the JSON data
@@ -189,7 +218,7 @@ def export_articles(request):
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
})
# Build the JSON response
response = HttpResponse(
json.dumps(articles_data, ensure_ascii=False, indent=2),
@@ -197,16 +226,16 @@ def export_articles(request):
)
response['Content-Disposition'] = 'attachment; filename="articles.json"'
return response
elif format_type == 'csv':
# Build the CSV response
response = HttpResponse(content_type='text/csv')
response['Content-Disposition'] = 'attachment; filename="articles.csv"'
# Create the CSV writer
writer = csv.writer(response)
writer.writerow(['ID', 'Title', 'Website', 'URL', 'Publish Time', 'Content', 'Created At', 'Media Files'])
# Write the article rows
for article in articles:
writer.writerow([
@@ -219,25 +248,25 @@ def export_articles(request):
article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
';'.join(article.media_files) if article.media_files else ''
])
return response
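One practical caveat with the CSV branch above, not addressed in this commit: Excel tends to misread UTF-8 CSV files that lack a byte-order mark, which matters here because the rows contain Chinese text. A minimal sketch of the usual workaround, writing a BOM before the header row:

import csv
from django.http import HttpResponse

response = HttpResponse(content_type='text/csv; charset=utf-8')
response['Content-Disposition'] = 'attachment; filename="articles.csv"'
response.write('\ufeff')  # UTF-8 BOM so Excel detects the encoding
writer = csv.writer(response)
writer.writerow(['ID', 'Title', 'Website', 'URL', 'Publish Time', 'Content', 'Created At', 'Media Files'])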
# New: support exporting in ZIP format
elif format_type == 'zip':
import zipfile
from io import BytesIO
from django.conf import settings
import os
# Create an in-memory ZIP file
zip_buffer = BytesIO()
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
# Create a Word document for each article and add it to the ZIP
for article in articles:
# Create a separate folder for each article
article_folder = f"article_{article.id}_{article.title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')}"
# Assemble the article data
article_data = {
'id': article.id,
@@ -249,7 +278,7 @@ def export_articles(request):
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'media_files': article.media_files
}
# Save the article as a Word file and add it to the ZIP
try:
from docx import Document
@@ -257,23 +286,24 @@ def export_articles(request):
from io import BytesIO
from bs4 import BeautifulSoup
import requests
# Create the Word document
doc = Document()
doc.add_heading(article.title, 0)
# Add the article metadata
doc.add_paragraph(f"Website: {article.website.name}")
doc.add_paragraph(f"URL: {article.url}")
doc.add_paragraph(f"Publish time: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
doc.add_paragraph(
f"Publish time: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
doc.add_paragraph(f"Created at: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
# Add the article content
doc.add_heading('Content', level=1)
# Parse the HTML content
soup = BeautifulSoup(article.content, 'html.parser')
# Handle images embedded in the content
for img in soup.find_all('img'):
src = img.get('src', '')
@@ -293,13 +323,13 @@ def export_articles(request):
except Exception as e:
# If adding the image fails, add the image URL as text
doc.add_paragraph(f"[Image: {src}]")
# Remove the original img tag
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
# Add media file information
if article.media_files:
doc.add_heading('Media Files', level=1)
@@ -309,7 +339,7 @@ def export_articles(request):
if os.path.exists(full_path):
# Check the file extension to decide how to handle it
file_extension = os.path.splitext(media_file)[1].lower()
# Image file handling
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
doc.add_picture(full_path, width=Inches(4.0))
@@ -324,7 +354,7 @@ def export_articles(request):
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
file_extension = os.path.splitext(media_file)[1].lower()
# Image file handling
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
image_stream = BytesIO(response.content)
@@ -335,20 +365,22 @@ def export_articles(request):
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
# Save the Word document to an in-memory buffer
doc_buffer = BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
# Add the Word document to the ZIP archive
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'), doc_buffer.read())
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'),
doc_buffer.read())
except ImportError:
# If the python-docx library is not installed, fall back to JSON format
json_data = json.dumps(article_data, ensure_ascii=False, indent=2)
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.json'), json_data)
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.json'),
json_data)
# Add the media files to the ZIP archive
if article.media_files:
for media_file in article.media_files:
@@ -362,19 +394,21 @@ def export_articles(request):
if media_file.startswith('http'):
import requests
response = requests.get(media_file, timeout=10)
zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
zip_file.writestr(
os.path.join(article_folder, 'media', os.path.basename(media_file)),
response.content)
except Exception as e:
# If adding a media file fails, continue with the remaining files
pass
# Build the HttpResponse
zip_buffer.seek(0)
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
return response
else:
return HttpResponse('Unsupported format', status=400)
except Exception as e:
return HttpResponse(f'Export failed: {str(e)}', status=500)
return HttpResponse(f'Export failed: {str(e)}', status=500)
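For completeness, a sketch of how a script might call the export view and save the resulting ZIP archive. The route /articles/export/ and host are assumptions, since the URL configuration is not shown in this diff; the view is csrf_exempt, so a plain POST with a JSON body works:

import json
import requests

payload = {"article_ids": [1, 2, 3], "format": "zip"}
resp = requests.post(
    "http://localhost:8000/articles/export/",  # hypothetical route
    data=json.dumps(payload),
    headers={"Content-Type": "application/json"},
    timeout=60,
)
if resp.ok:
    with open("articles_export.zip", "wb") as f:
        f.write(resp.content)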