import csv
import json
import threading
import uuid

from django.core.management import call_command
from django.core.paginator import Paginator
from django.http import HttpResponse, JsonResponse
from django.shortcuts import get_object_or_404, render
from django.utils import timezone
from django.views.decorators.csrf import csrf_exempt
from django.views.decorators.http import require_http_methods

from .models import Article, SiteConfig, Website

# Global dict used to track the state of crawler tasks
crawler_tasks = {}


def article_list(request):
    # All enabled websites
    websites = Website.objects.filter(enabled=True)

    # Optional website filter; start from all articles unless a filter is applied
    selected_website = None
    articles = Article.objects.all()
    website_id = request.GET.get('website')
    if website_id:
        try:
            selected_website = Website.objects.get(id=website_id)
            articles = articles.filter(website=selected_website)
        except Website.DoesNotExist:
            pass

    # Keyword search
    search_query = request.GET.get('q')
    if search_query:
        articles = articles.filter(title__icontains=search_query)

    # Media-type filter
    media_type = request.GET.get('media_type', 'all')
    if media_type == 'text_only':
        # Text-only articles (no media files)
        articles = articles.filter(media_files__isnull=True) | articles.filter(media_files=[])
    elif media_type == 'with_images':
        # Articles that contain images
        articles = articles.filter(media_files__icontains='.jpg') | \
                   articles.filter(media_files__icontains='.jpeg') | \
                   articles.filter(media_files__icontains='.png') | \
                   articles.filter(media_files__icontains='.gif')
    elif media_type == 'with_videos':
        # Articles that contain videos
        articles = articles.filter(media_files__icontains='.mp4') | \
                   articles.filter(media_files__icontains='.avi') | \
                   articles.filter(media_files__icontains='.mov') | \
                   articles.filter(media_files__icontains='.wmv') | \
                   articles.filter(media_files__icontains='.flv') | \
                   articles.filter(media_files__icontains='.webm')

    # Newest first
    articles = articles.order_by('-created_at')

    # Pagination: 40 articles per page
    paginator = Paginator(articles, 40)
    page_number = request.GET.get('page')
    page_obj = paginator.get_page(page_number)

    # Site configuration
    site_config = SiteConfig.get_config()

    return render(request, 'core/article_list.html', {
        'page_obj': page_obj,
        'websites': websites,
        'selected_website': selected_website,
        'search_query': search_query,
        'site_config': site_config,
    })


def article_detail(request, article_id):
    # get_object_or_404 returns 404 instead of a 500 error for unknown ids
    article = get_object_or_404(Article, id=article_id)
    return render(request, 'core/article_detail.html', {'article': article})
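
# Illustrative sketch (not part of the original module and not used by the views
# below): crawler_tasks is an in-process dict, so task state is lost on restart
# and is not shared between worker processes. Assuming Django's cache framework
# is configured, a cache-backed store could look roughly like this; the key
# prefix and timeout are assumptions.

def _save_crawler_task(task_id, info, timeout=3600):
    """Hypothetical helper: persist task state in the configured cache."""
    from django.core.cache import cache
    cache.set(f'crawler_task_{task_id}', info, timeout)


def _load_crawler_task(task_id):
    """Hypothetical helper: return task state, or None if unknown or expired."""
    from django.core.cache import cache
    return cache.get(f'crawler_task_{task_id}')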

# Trigger a crawler run, with task-ID generation and status tracking
@require_http_methods(["POST"])
def run_crawler(request):
    """Trigger a crawler task from the front end."""
    try:
        # Name of the crawler to run
        crawler_name = request.POST.get('crawler_name', '')
        if not crawler_name:
            return JsonResponse({'status': 'error', 'message': '爬虫名称不能为空'})

        # Generate a task ID
        task_id = str(uuid.uuid4())

        # Article count before the task starts
        initial_count = Article.objects.count()

        # Run the crawler in a background thread
        def run_spider():
            try:
                # Mark the task as running
                crawler_tasks[task_id] = {
                    'status': 'running',
                    'message': '爬虫正在运行...',
                    'start_time': timezone.now(),
                    'initial_count': initial_count,
                }

                # Dispatch to the matching management command
                if crawler_name in ['crawl_xinhua', 'crawl_dongfangyancao']:
                    call_command(crawler_name)
                else:
                    # Generic crawlers go through the crawl_articles command
                    call_command('crawl_articles', crawler_name)

                # Number of newly added articles
                final_count = Article.objects.count()
                added_count = final_count - initial_count

                # Mark the task as completed
                crawler_tasks[task_id] = {
                    'status': 'completed',
                    'message': f'爬虫已完成,新增 {added_count} 篇文章',
                    'added_count': added_count,
                    'end_time': timezone.now(),
                }
            except Exception as e:
                # Provide a friendlier message for known errors
                error_msg = str(e)
                if "UNIQUE constraint failed" in error_msg and "core_article.url" in error_msg:
                    error_msg = "检测到重复文章URL,已跳过重复项"
                else:
                    print(f"爬虫执行出错: {e}")

                # Count the articles that were still added despite the error
                final_count = Article.objects.count()
                added_count = final_count - initial_count

                # Mark the task as completed even though parts of it failed
                crawler_tasks[task_id] = {
                    'status': 'completed',
                    'message': f'爬虫已完成,新增 {added_count} 篇文章。{error_msg}',
                    'added_count': added_count,
                    'end_time': timezone.now(),
                    'error': error_msg,
                }

        # Start the background thread
        thread = threading.Thread(target=run_spider)
        thread.daemon = True
        thread.start()

        return JsonResponse({'status': 'success', 'message': f'爬虫 {crawler_name} 已启动', 'task_id': task_id})
    except Exception as e:
        return JsonResponse({'status': 'error', 'message': str(e)})


# Check the status of a crawler task
@require_http_methods(["POST"])
def crawler_status(request):
    """Check the status of a crawler task."""
    try:
        task_id = request.POST.get('task_id', '')
        if not task_id:
            return JsonResponse({'status': 'error', 'message': '任务ID不能为空'})

        # Look up the task
        task_info = crawler_tasks.get(task_id)
        if not task_info:
            return JsonResponse({'status': 'error', 'message': '未找到任务'})

        return JsonResponse(task_info)
    except Exception as e:
        return JsonResponse({'status': 'error', 'message': str(e)})


# Pause a crawler task
@require_http_methods(["POST"])
def pause_crawler(request):
    """Pause a crawler task."""
    try:
        task_id = request.POST.get('task_id', '')
        if not task_id:
            return JsonResponse({'status': 'error', 'message': '任务ID不能为空'})

        # Look up the task
        task_info = crawler_tasks.get(task_id)
        if not task_info:
            return JsonResponse({'status': 'error', 'message': '未找到任务'})

        # A real application should implement actual pause logic here;
        # for now the task status is only updated to simulate pausing.
        task_info['status'] = 'paused'
        task_info['message'] = '爬虫已暂停'

        return JsonResponse({
            'status': 'success',
            'message': '爬虫已暂停',
            'progress': 0,  # should return the actual progress
        })
    except Exception as e:
        return JsonResponse({'status': 'error', 'message': str(e)})
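
# Illustrative sketch (assumption, not wired into the views above): pause_crawler
# only flips a status flag. A real pause could hand each task a threading.Event
# that the crawler loop checks between items; pause_events and wait_if_paused
# below are hypothetical names.

pause_events = {}  # task_id -> threading.Event; a set event means "paused"


def wait_if_paused(task_id, poll_seconds=1.0):
    """Hypothetical helper: block a crawler loop while its pause flag is set."""
    import time
    event = pause_events.get(task_id)
    while event is not None and event.is_set():
        time.sleep(poll_seconds)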

# Export selected articles
@csrf_exempt
@require_http_methods(["POST"])
def export_articles(request):
    try:
        # Parse the request payload
        data = json.loads(request.body)
        article_ids = data.get('article_ids', [])
        format_type = data.get('format', 'json')

        # Selected articles
        articles = Article.objects.filter(id__in=article_ids)
        if not articles.exists():
            return HttpResponse('没有选中文章', status=400)

        # Export in the requested format
        if format_type == 'json':
            # Build the JSON payload
            articles_data = []
            for article in articles:
                articles_data.append({
                    'id': article.id,
                    'title': article.title,
                    'website': article.website.name,
                    'url': article.url,
                    'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                    'content': article.content,
                    'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                    'media_files': article.media_files,
                })

            # Build the JSON response
            response = HttpResponse(
                json.dumps(articles_data, ensure_ascii=False, indent=2),
                content_type='application/json'
            )
            response['Content-Disposition'] = 'attachment; filename="articles.json"'
            return response

        elif format_type == 'csv':
            # Build the CSV response
            response = HttpResponse(content_type='text/csv')
            response['Content-Disposition'] = 'attachment; filename="articles.csv"'

            writer = csv.writer(response)
            writer.writerow(['ID', '标题', '网站', 'URL', '发布时间', '内容', '创建时间', '媒体文件'])

            # Write one row per article
            for article in articles:
                writer.writerow([
                    article.id,
                    article.title,
                    article.website.name,
                    article.url,
                    article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else '',
                    article.content,
                    article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                    ';'.join(article.media_files) if article.media_files else '',
                ])
            return response

        elif format_type == 'zip':
            import os
            import zipfile
            from io import BytesIO

            from django.conf import settings

            # Build the ZIP archive in memory, one Word document per article
            zip_buffer = BytesIO()
            with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
                for article in articles:
                    # Each article gets its own folder; strip characters that are
                    # not safe in file names
                    safe_title = article.title
                    for ch in '/\\:*?"<>|':
                        safe_title = safe_title.replace(ch, '_')
                    article_folder = f"article_{article.id}_{safe_title}"

                    # Article data used by the JSON fallback below
                    article_data = {
                        'id': article.id,
                        'title': article.title,
                        'website': article.website.name,
                        'url': article.url,
                        'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                        'content': article.content,
                        'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                        'media_files': article.media_files,
                    }

                    # Render the article as a Word document and add it to the ZIP
                    try:
                        from docx import Document
                        from docx.shared import Inches
                        from bs4 import BeautifulSoup
                        import requests

                        # Create the Word document
                        doc = Document()
                        doc.add_heading(article.title, 0)

                        # Article metadata
                        doc.add_paragraph(f"网站: {article.website.name}")
                        doc.add_paragraph(f"URL: {article.url}")
                        doc.add_paragraph(
                            f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
                        doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")

                        # Article body
                        doc.add_heading('内容', level=1)

                        # Parse the HTML content
                        soup = BeautifulSoup(article.content, 'html.parser')

                        # Embed images referenced in the content
                        for img in soup.find_all('img'):
                            src = img.get('src', '')
                            if src:
                                try:
                                    if src.startswith('http'):
                                        # Remote image
                                        response = requests.get(src, timeout=10)
                                        image_stream = BytesIO(response.content)
                                        doc.add_picture(image_stream, width=Inches(4.0))
                                    else:
                                        # Local image
                                        full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                                        if os.path.exists(full_path):
                                            doc.add_picture(full_path, width=Inches(4.0))
                                except Exception:
                                    # If embedding fails, fall back to the image URL as text
                                    doc.add_paragraph(f"[图片: {src}]")
                            # Remove the original <img> tag
                            img.decompose()

                        content_text = soup.get_text()
                        doc.add_paragraph(content_text)

                        # Media file information
                        if article.media_files:
                            doc.add_heading('媒体文件', level=1)
                            for media_file in article.media_files:
                                try:
                                    full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                                    if os.path.exists(full_path):
                                        # Decide how to handle the file by its extension
                                        file_extension = os.path.splitext(media_file)[1].lower()
                                        if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
                                            # Image file
                                            doc.add_picture(full_path, width=Inches(4.0))
                                        elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
                                            # Video file
                                            doc.add_paragraph(f"[视频文件: {media_file}]")
                                        else:
                                            # Any other file type
                                            doc.add_paragraph(f"[文件: {media_file}]")
                                    else:
                                        # Media file referenced by URL
                                        if media_file.startswith('http'):
                                            response = requests.get(media_file, timeout=10)
                                            file_extension = os.path.splitext(media_file)[1].lower()
                                            if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
                                                image_stream = BytesIO(response.content)
                                                doc.add_picture(image_stream, width=Inches(4.0))
                                            else:
                                                doc.add_paragraph(f"[文件: {media_file}]")
                                        else:
                                            doc.add_paragraph(media_file)
                                except Exception:
                                    doc.add_paragraph(media_file)

                        # Save the Word document to memory
                        doc_buffer = BytesIO()
                        doc.save(doc_buffer)
                        doc_buffer.seek(0)

                        # Add the Word document to the ZIP archive
                        zip_file.writestr(os.path.join(article_folder, f'{safe_title}.docx'),
                                          doc_buffer.read())
                    except ImportError:
                        # Fall back to JSON if python-docx is not installed
                        json_data = json.dumps(article_data, ensure_ascii=False, indent=2)
                        zip_file.writestr(os.path.join(article_folder, f'{safe_title}.json'),
                                          json_data)

                    # Add the media files themselves to the ZIP archive
                    if article.media_files:
                        for media_file in article.media_files:
                            try:
                                full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                                if os.path.exists(full_path):
                                    zip_file.write(full_path,
                                                   os.path.join(article_folder, 'media', media_file))
                                else:
                                    # Media file referenced by URL
                                    if media_file.startswith('http'):
                                        import requests
                                        response = requests.get(media_file, timeout=10)
                                        zip_file.writestr(
                                            os.path.join(article_folder, 'media',
                                                         os.path.basename(media_file)),
                                            response.content)
                            except Exception:
                                # Skip media files that fail and keep processing the rest
                                pass

            # Build the HTTP response
            zip_buffer.seek(0)
            response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
            response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
            return response

        else:
            return HttpResponse('不支持的格式', status=400)
    except Exception as e:
        return HttpResponse(f'导出失败: {str(e)}', status=500)
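
# Expected request payloads, derived from the view code above and below (the URL
# routes themselves live in urls.py):
#
#   export_articles:          {"article_ids": [1, 2, 3], "format": "json" | "csv" | "zip"}
#   export_articles_by_type:  {"media_type": "all" | "text_only" | "with_images" | "with_videos",
#                              "format": "zip"}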

# Export articles filtered by media type
@csrf_exempt
@require_http_methods(["POST"])
def export_articles_by_type(request):
    try:
        # Parse the request payload
        data = json.loads(request.body)
        media_type = data.get('media_type', 'all')
        format_type = data.get('format', 'zip')

        # Filter articles by media type
        if media_type == 'text_only':
            # Text-only articles (no media files, or an empty list)
            articles = Article.objects.filter(media_files__isnull=True) | Article.objects.filter(media_files=[])
        elif media_type == 'with_images':
            # Articles that contain images
            articles = Article.objects.filter(media_files__icontains='.jpg') | \
                       Article.objects.filter(media_files__icontains='.jpeg') | \
                       Article.objects.filter(media_files__icontains='.png') | \
                       Article.objects.filter(media_files__icontains='.gif')
        elif media_type == 'with_videos':
            # Articles that contain videos
            articles = Article.objects.filter(media_files__icontains='.mp4') | \
                       Article.objects.filter(media_files__icontains='.avi') | \
                       Article.objects.filter(media_files__icontains='.mov') | \
                       Article.objects.filter(media_files__icontains='.wmv') | \
                       Article.objects.filter(media_files__icontains='.flv') | \
                       Article.objects.filter(media_files__icontains='.webm')
        else:
            # All articles
            articles = Article.objects.all()

        # Remove duplicates introduced by the OR-combined querysets
        articles = articles.distinct()

        if not articles.exists():
            return HttpResponse('没有符合条件的文章', status=400)

        # Export as a ZIP archive
        if format_type == 'zip':
            import os
            import zipfile
            from io import BytesIO

            from django.conf import settings

            # Build the ZIP archive in memory, one Word document per article
            zip_buffer = BytesIO()
            with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
                for article in articles:
                    # Each article gets its own folder; strip characters that are
                    # not safe in file names
                    safe_title = article.title
                    for ch in '/\\:*?"<>|':
                        safe_title = safe_title.replace(ch, '_')
                    article_folder = f"article_{article.id}_{safe_title}"

                    # Article data used by the JSON fallback below
                    article_data = {
                        'id': article.id,
                        'title': article.title,
                        'website': article.website.name,
                        'url': article.url,
                        'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                        'content': article.content,
                        'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                        'media_files': article.media_files,
                    }

                    # Render the article as a Word document and add it to the ZIP
                    try:
                        from docx import Document
                        from docx.shared import Inches
                        from bs4 import BeautifulSoup
                        import requests

                        # Create the Word document
                        doc = Document()
                        doc.add_heading(article.title, 0)

                        # Article metadata
                        doc.add_paragraph(f"网站: {article.website.name}")
                        doc.add_paragraph(f"URL: {article.url}")
                        doc.add_paragraph(
                            f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
                        doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")

                        # Article body
                        doc.add_heading('内容', level=1)

                        # Parse the HTML content
                        soup = BeautifulSoup(article.content, 'html.parser')

                        # Embed images referenced in the content
                        for img in soup.find_all('img'):
                            src = img.get('src', '')
                            if src:
                                try:
                                    if src.startswith('http'):
                                        # Remote image
                                        response = requests.get(src, timeout=10)
                                        image_stream = BytesIO(response.content)
                                        doc.add_picture(image_stream, width=Inches(4.0))
                                    else:
                                        # Local image
                                        full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                                        if os.path.exists(full_path):
                                            doc.add_picture(full_path, width=Inches(4.0))
                                except Exception:
                                    # If embedding fails, fall back to the image URL as text
                                    doc.add_paragraph(f"[图片: {src}]")
                            # Remove the original <img> tag
                            img.decompose()

                        content_text = soup.get_text()
                        doc.add_paragraph(content_text)

                        # Media file information
                        if article.media_files:
                            doc.add_heading('媒体文件', level=1)
                            for media_file in article.media_files:
                                try:
                                    full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                                    if os.path.exists(full_path):
                                        # Decide how to handle the file by its extension
                                        file_extension = os.path.splitext(media_file)[1].lower()
                                        if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
                                            # Image file
                                            doc.add_picture(full_path, width=Inches(4.0))
                                        elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
                                            # Video file
                                            doc.add_paragraph(f"[视频文件: {media_file}]")
                                        else:
                                            # Any other file type
                                            doc.add_paragraph(f"[文件: {media_file}]")
                                    else:
                                        # Media file referenced by URL
                                        if media_file.startswith('http'):
                                            response = requests.get(media_file, timeout=10)
                                            file_extension = os.path.splitext(media_file)[1].lower()
                                            if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
                                                image_stream = BytesIO(response.content)
                                                doc.add_picture(image_stream, width=Inches(4.0))
                                            else:
                                                doc.add_paragraph(f"[文件: {media_file}]")
                                        else:
                                            doc.add_paragraph(media_file)
                                except Exception:
                                    doc.add_paragraph(media_file)

                        # Save the Word document to memory
                        doc_buffer = BytesIO()
                        doc.save(doc_buffer)
                        doc_buffer.seek(0)

                        # Add the Word document to the ZIP archive
                        zip_file.writestr(os.path.join(article_folder, f'{safe_title}.docx'),
                                          doc_buffer.read())
                    except ImportError:
                        # Fall back to JSON if python-docx is not installed
                        json_data = json.dumps(article_data, ensure_ascii=False, indent=2)
                        zip_file.writestr(os.path.join(article_folder, f'{safe_title}.json'),
                                          json_data)

                    # Add the media files themselves to the ZIP archive
                    if article.media_files:
                        for media_file in article.media_files:
                            try:
                                full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                                if os.path.exists(full_path):
                                    zip_file.write(full_path,
                                                   os.path.join(article_folder, 'media', media_file))
                                else:
                                    # Media file referenced by URL
                                    if media_file.startswith('http'):
                                        import requests
                                        response = requests.get(media_file, timeout=10)
                                        zip_file.writestr(
                                            os.path.join(article_folder, 'media',
                                                         os.path.basename(media_file)),
                                            response.content)
                            except Exception:
                                # Skip media files that fail and keep processing the rest
                                pass

            # Build the HTTP response
            zip_buffer.seek(0)
            response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
            response['Content-Disposition'] = f'attachment; filename=articles_{media_type}.zip'
            return response

        else:
            return HttpResponse('不支持的格式', status=400)
    except Exception as e:
        return HttpResponse(f'导出失败: {str(e)}', status=500)
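
# Usage sketch (assumption, not part of the original module): the front end POSTs
# crawler_name to run_crawler and then polls crawler_status with the returned
# task_id. run_crawler and crawler_status are not csrf_exempt, so a real client
# must also send a valid CSRF token. With illustrative URL paths (the actual
# routes live in urls.py), the flow is roughly:
#
#     resp = requests.post('/run-crawler/', data={'crawler_name': 'crawl_xinhua'},
#                          headers={'X-CSRFToken': csrf_token}, cookies=cookies)
#     task_id = resp.json()['task_id']
#     status = requests.post('/crawler-status/', data={'task_id': task_id},
#                            headers={'X-CSRFToken': csrf_token}, cookies=cookies).json()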