# icac/crawler/views.py
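"""Views for the crawler app: a statistics dashboard, keyword search, an HTML
preview of crawled articles, and zip-archive exports of articles as Word
documents bundled with their media files."""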

import json
import os
import tempfile
import zipfile
from io import BytesIO

from django.conf import settings
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from django.core.paginator import Paginator
from django.db.models import Q, Count
from django.db.models.functions import TruncDate
from django.http import HttpResponse, Http404
from django.shortcuts import render, get_object_or_404
from django.utils import timezone
from docx import Document
from rest_framework import viewsets, filters
from rest_framework.decorators import action
from rest_framework.response import Response

from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
from .serializers import (
    WebsiteSerializer, CrawlTaskSerializer, CrawledContentSerializer,
    CrawlLogSerializer, SearchKeywordSerializer
)
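
# Note: Document above is provided by the third-party python-docx package
# (installed as "python-docx"); it is used below to build the .docx exports.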


def dashboard(request):
    """Dashboard view."""
    # Summary statistics
    total_websites = Website.objects.filter(is_active=True).count()
    total_tasks = CrawlTask.objects.count()
    total_contents = CrawledContent.objects.count()
    active_tasks = CrawlTask.objects.filter(status='running').count()
    # All active websites
    websites = Website.objects.filter(is_active=True).order_by('name')
    # Currently selected website id, if any
    selected_website_id = request.GET.get('website')
    # Pagination parameters
    page_number = request.GET.get('page', 1)
    page_size = request.GET.get('page_size', 20)  # default: 20 articles per page
    # Coerce page_size to an int and clamp it to a sensible range
    try:
        page_size = int(page_size)
        page_size = max(10, min(100, page_size))
    except (ValueError, TypeError):
        page_size = 20
    # All crawled content, ordered by website and creation time
    all_contents = CrawledContent.objects.select_related('website').order_by('website__name', '-created_at')
    # Narrow the queryset if a specific website was selected
    if selected_website_id:
        try:
            selected_website_id = int(selected_website_id)
            all_contents = all_contents.filter(website_id=selected_website_id)
        except (ValueError, TypeError):
            pass
    # Paginate
    paginator = Paginator(all_contents, page_size)
    page_obj = paginator.get_page(page_number)
    # Group the current page's content by website
    contents_by_website = {}
    for content in page_obj:
        website_name = content.website.name
        if website_name not in contents_by_website:
            contents_by_website[website_name] = []
        contents_by_website[website_name].append(content)
    # Most recent tasks
    recent_tasks = CrawlTask.objects.order_by('-created_at')[:5]
    # Media file statistics
    total_media_files = CrawledContent.objects.aggregate(
        total_media=Count('media_files')
    )['total_media'] or 0
    stats = {
        'total_websites': total_websites,
        'total_tasks': total_tasks,
        'total_contents': total_contents,
        'active_tasks': active_tasks,
        'websites': websites,
        'selected_website_id': selected_website_id,
        'page_obj': page_obj,
        'contents_by_website': contents_by_website,
        'page_size': page_size,
        'recent_tasks': recent_tasks,
        'total_media_files': total_media_files,
    }
    return render(request, 'crawler/dashboard.html', {'stats': stats})
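
# Illustrative request (assumes the dashboard is mounted at the site root, as
# the navbar links in preview_crawled_content below suggest):
#     GET /?website=3&page=2&page_size=50
# filters to website id 3 and renders page 2 with 50 articles per page.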


def search_page(request):
    """Search page view."""
    keyword = request.GET.get('q', '').strip()
    contents = []
    if keyword:
        # Record the search keyword. Note: `defaults` only applies on
        # creation, so `last_used` is not refreshed for existing keywords.
        SearchKeyword.objects.get_or_create(
            keyword=keyword,
            defaults={'last_used': timezone.now()}
        )
        # Match against the title, body, and stored matched-keywords field
        contents = CrawledContent.objects.filter(
            Q(title__icontains=keyword) |
            Q(content__icontains=keyword) |
            Q(keywords_matched__icontains=keyword)
        ).order_by('-created_at')[:50]
    return render(request, 'crawler/search.html', {
        'keyword': keyword,
        'contents': contents
    })
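
# Illustrative request (the /search/ prefix is an assumption; the real path
# depends on the project's URLconf):
#     GET /search/?q=corruption
# records the keyword and returns up to 50 matching articles, newest first.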


def preview_crawled_content(request, content_id):
    """Preview a crawled article as a standalone HTML page."""
    content = get_object_or_404(CrawledContent, id=content_id)
    # Associated media files
    media_files = content.media_files.all()
    # Build the media-file section of the page
    media_section = ""
    if media_files:
        media_section = """
        <div class="media-section">
            <h3>媒体文件</h3>
        """
        for media_file in media_files:
            if media_file.media_type == 'image':
                media_section += f"""
                <div class="media-item">
                    <h4>图片: {media_file.alt_text or '无标题'}</h4>
                    <img src="/media/{media_file.local_file.name}" alt="{media_file.alt_text}" style="max-width: 100%; height: auto;">
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
            elif media_file.media_type == 'video':
                media_section += f"""
                <div class="media-item">
                    <h4>视频</h4>
                    <video controls style="max-width: 100%;">
                        <source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
                        您的浏览器不支持视频播放。
                    </video>
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
            elif media_file.media_type == 'audio':
                media_section += f"""
                <div class="media-item">
                    <h4>音频</h4>
                    <audio controls>
                        <source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
                        您的浏览器不支持音频播放。
                    </audio>
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
            else:
                media_section += f"""
                <div class="media-item">
                    <h4>文件: {media_file.get_media_type_display()}</h4>
                    <p><a href="/media/{media_file.local_file.name}" download>下载文件</a></p>
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
        media_section += "        </div>"
    # Turn blank-line-separated paragraphs into <p> tags and single newlines
    # into <br>. Note: the text is interpolated below without HTML escaping.
    formatted_content = content.content.replace('\n\n', '</p><p>').replace('\n', '<br>')
    # Render the preview page dynamically
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>{content.title}</title>
        <style>
            body {{
                font-family: Arial, sans-serif;
                line-height: 1.6;
                max-width: 1200px;
                margin: 40px auto;
            }}
            h1 {{ color: #333; margin-bottom: 20px; }}
            .meta {{
                color: #666;
                margin-bottom: 30px;
                padding: 20px;
                background-color: #f8f9fa;
                border-radius: 8px;
                border-left: 4px solid #007bff;
            }}
            .content {{
                line-height: 1.8;
                font-size: 16px;
                margin-bottom: 30px;
            }}
            .content p {{
                margin-bottom: 1em;
            }}
            .media-section {{
                margin-top: 30px;
                padding: 20px;
                background-color: #f8f9fa;
                border-radius: 8px;
            }}
            .media-item {{
                margin-bottom: 20px;
                padding: 15px;
                border: 1px solid #ddd;
                border-radius: 5px;
                background-color: white;
            }}
            .media-item h4 {{
                margin-top: 0;
                color: #555;
                border-bottom: 1px solid #eee;
                padding-bottom: 10px;
            }}
            .back-link {{
                margin-bottom: 20px;
            }}
            .back-link a {{
                color: #007bff;
                text-decoration: none;
                font-weight: bold;
                padding: 8px 16px;
                background-color: #f8f9fa;
                border: 1px solid #ddd;
                border-radius: 4px;
            }}
            .back-link a:hover {{
                text-decoration: underline;
                background-color: #e9ecef;
            }}
            .navbar {{
                background-color: #007bff;
                padding: 15px;
                margin-bottom: 30px;
                border-radius: 8px;
            }}
            .navbar a {{
                color: white;
                text-decoration: none;
                margin-right: 20px;
                font-weight: bold;
            }}
            .navbar a:hover {{
                text-decoration: underline;
            }}
        </style>
    </head>
    <body>
        <div class="navbar">
            <a href="/">仪表板</a>
            <a href="/admin/crawler/crawledcontent/">管理界面</a>
            <a href="javascript:history.back()">← 返回</a>
        </div>
        <h1>{content.title}</h1>
        <div class="meta">
            <p><strong>来源网站:</strong> {content.website.name} ({content.website.region})</p>
            <p><strong>原始链接:</strong> <a href="{content.url}" target="_blank">{content.url}</a></p>
            <p><strong>发布时间:</strong> {content.publish_date or '未知'}</p>
            <p><strong>作者:</strong> {content.author or '未知'}</p>
            <p><strong>匹配关键字:</strong> {content.keywords_matched}</p>
            <p><strong>爬取时间:</strong> {content.created_at}</p>
            <p><strong>媒体文件数量:</strong> {len(media_files)}</p>
        </div>
        <div class="content">
            <p>{formatted_content}</p>
        </div>
        {media_section}
    </body>
    </html>
    """
    return HttpResponse(html_content, content_type='text/html; charset=utf-8')
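
# Hardening sketch (assumption: crawled text and titles may contain markup).
# The preview above interpolates values into the page without HTML escaping;
# if the source data is not guaranteed safe, django.utils.html.escape can be
# applied first, for example:
#
#     from django.utils.html import escape
#     formatted_content = (escape(content.content)
#                          .replace('\n\n', '</p><p>')
#                          .replace('\n', '<br>'))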


def download_crawled_content(request, content_id):
    """Download an article as a zip archive containing a Word document and its media files."""
    content = get_object_or_404(CrawledContent, id=content_id)
    # In-memory buffer that will hold the zip archive
    zip_buffer = BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        # Build the Word document
        doc = Document()
        doc.add_heading(content.title, 0)
        # Metadata
        doc.add_paragraph(f'来源网站: {content.website.name} ({content.website.region})')
        doc.add_paragraph(f'原始链接: {content.url}')
        doc.add_paragraph(f'发布时间: {content.publish_date or "未知"}')
        doc.add_paragraph(f'作者: {content.author or "未知"}')
        doc.add_paragraph(f'匹配关键字: {content.keywords_matched}')
        doc.add_paragraph(f'爬取时间: {content.created_at}')
        # Body text
        doc.add_heading('正文', level=1)
        for paragraph in content.content.split('\n\n'):
            if paragraph.strip():
                doc.add_paragraph(paragraph.strip())
        # Serialize the Word document into memory
        doc_buffer = BytesIO()
        doc.save(doc_buffer)
        doc_buffer.seek(0)
        # Add the Word document to the archive
        zip_file.writestr(f"{content.title[:50]}.docx", doc_buffer.getvalue())
        # Add the media files to the archive
        media_files = content.media_files.all()
        for media_file in media_files:
            try:
                if media_file.local_file and default_storage.exists(media_file.local_file.name):
                    with default_storage.open(media_file.local_file.name) as f:
                        file_content = f.read()
                    zip_file.writestr(f"media/{os.path.basename(media_file.local_file.name)}", file_content)
            except Exception:
                # Skip files that cannot be read; continue with the rest
                continue
    # Build the response
    zip_buffer.seek(0)
    response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
    filename = f"{content.title[:50]}.zip"
    response['Content-Disposition'] = f'attachment; filename="{filename}"'
    return response
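
# Design note: download_crawled_content above (and download_selected_contents
# below) assemble the zip entirely in memory via BytesIO, which is simple but
# holds every file in RAM at once. For very large media sets, spooling to disk
# (e.g. tempfile.SpooledTemporaryFile; the tempfile module is already imported)
# or a streaming django.http.FileResponse would be gentler on memory.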


def download_selected_contents(request):
    """Download the selected articles as a single zip archive."""
    if request.method == 'POST':
        # Ids of the articles selected in the form
        selected_ids = request.POST.getlist('selected_contents')
        if not selected_ids:
            return HttpResponse("请至少选择一篇文章", status=400)
        contents = CrawledContent.objects.filter(id__in=selected_ids)
        if not contents.exists():
            return HttpResponse("未找到选中的文章", status=404)
        # In-memory buffer that will hold the zip archive
        zip_buffer = BytesIO()
        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for content in contents:
                # One folder per article, restricted to filesystem-safe characters
                folder_name = content.title[:30].strip()
                folder_name = "".join(c for c in folder_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
                # Build the Word document
                doc = Document()
                doc.add_heading(content.title, 0)
                # Metadata
                doc.add_paragraph(f'来源网站: {content.website.name} ({content.website.region})')
                doc.add_paragraph(f'原始链接: {content.url}')
                doc.add_paragraph(f'发布时间: {content.publish_date or "未知"}')
                doc.add_paragraph(f'作者: {content.author or "未知"}')
                doc.add_paragraph(f'匹配关键字: {content.keywords_matched}')
                doc.add_paragraph(f'爬取时间: {content.created_at}')
                # Body text
                doc.add_heading('正文', level=1)
                for paragraph in content.content.split('\n\n'):
                    if paragraph.strip():
                        doc.add_paragraph(paragraph.strip())
                # Serialize the Word document into memory
                doc_buffer = BytesIO()
                doc.save(doc_buffer)
                doc_buffer.seek(0)
                # Add the Word document to the archive
                zip_file.writestr(f"{folder_name}/{content.title[:50]}.docx", doc_buffer.getvalue())
                # Add the media files to the archive
                media_files = content.media_files.all()
                for media_file in media_files:
                    try:
                        if media_file.local_file and default_storage.exists(media_file.local_file.name):
                            with default_storage.open(media_file.local_file.name) as f:
                                file_content = f.read()
                            zip_file.writestr(f"{folder_name}/media/{os.path.basename(media_file.local_file.name)}", file_content)
                    except Exception:
                        # Skip files that cannot be read; continue with the rest
                        continue
        # Build the response
        zip_buffer.seek(0)
        response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
        response['Content-Disposition'] = 'attachment; filename="selected_articles.zip"'
        return response
    return HttpResponse("无效的请求方法", status=405)