from django.shortcuts import render, get_object_or_404
from django.http import HttpResponse, Http404
from django.db.models import Q, Count
from django.conf import settings
from django.utils import timezone
from django.core.files.storage import default_storage
from django.utils.html import escape
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
from rest_framework import viewsets, filters
from rest_framework.decorators import action
from rest_framework.response import Response
from .serializers import (
    WebsiteSerializer, CrawlTaskSerializer, CrawledContentSerializer,
    CrawlLogSerializer, SearchKeywordSerializer
)
import json
from django.core.paginator import Paginator
from django.db.models.functions import TruncDate
import os
import tempfile
import zipfile
from io import BytesIO
from docx import Document
from django.core.files.base import ContentFile
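
# Example URL configuration for these views (a sketch for reference only; the
# actual routes live in the project's urls.py and may differ):
#
#   from django.urls import path
#   from . import views
#
#   urlpatterns = [
#       path('', views.dashboard, name='dashboard'),
#       path('search/', views.search_page, name='search'),
#       path('content/<int:content_id>/preview/', views.preview_crawled_content, name='content-preview'),
#       path('content/<int:content_id>/download/', views.download_crawled_content, name='content-download'),
#       path('contents/download/', views.download_selected_contents, name='download-selected'),
#   ]
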
def dashboard(request):
    """Dashboard view."""
    # Summary statistics
    total_websites = Website.objects.filter(is_active=True).count()
    total_tasks = CrawlTask.objects.count()
    total_contents = CrawledContent.objects.count()
    active_tasks = CrawlTask.objects.filter(status='running').count()

    # All active websites
    websites = Website.objects.filter(is_active=True).order_by('name')

    # Currently selected website ID, if any
    selected_website_id = request.GET.get('website')

    # Pagination parameters
    page_number = request.GET.get('page', 1)
    page_size = request.GET.get('page_size', 20)  # Default: 20 articles per page

    # Coerce page_size to an integer and clamp it to a sensible range
    try:
        page_size = int(page_size)
        page_size = max(10, min(100, page_size))
    except (ValueError, TypeError):
        page_size = 20

    # All crawled content, ordered by website name and creation time
    all_contents = CrawledContent.objects.select_related('website').order_by('website__name', '-created_at')

    # Filter by website if one is selected
    if selected_website_id:
        try:
            selected_website_id = int(selected_website_id)
            all_contents = all_contents.filter(website_id=selected_website_id)
        except (ValueError, TypeError):
            pass

    # Paginate
    paginator = Paginator(all_contents, page_size)
    page_obj = paginator.get_page(page_number)

    # Group the current page's contents by website
    contents_by_website = {}
    for content in page_obj:
        website_name = content.website.name
        if website_name not in contents_by_website:
            contents_by_website[website_name] = []
        contents_by_website[website_name].append(content)

    # Most recent tasks
    recent_tasks = CrawlTask.objects.order_by('-created_at')[:5]

    # Media file statistics
    total_media_files = CrawledContent.objects.aggregate(
        total_media=Count('media_files')
    )['total_media'] or 0

    stats = {
        'total_websites': total_websites,
        'total_tasks': total_tasks,
        'total_contents': total_contents,
        'active_tasks': active_tasks,
        'websites': websites,
        'selected_website_id': selected_website_id,
        'page_obj': page_obj,
        'contents_by_website': contents_by_website,
        'page_size': page_size,
        'recent_tasks': recent_tasks,
        'total_media_files': total_media_files,
    }

    return render(request, 'crawler/dashboard.html', {'stats': stats})
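
# Template-side sketch of how the dashboard context above can be consumed
# (illustrative markup only; the real template is crawler/dashboard.html and
# may differ):
#
#   {% for website_name, contents in stats.contents_by_website.items %}
#     <h2>{{ website_name }}</h2>
#     <ul>{% for c in contents %}<li>{{ c.title }}</li>{% endfor %}</ul>
#   {% endfor %}
#   {% if stats.page_obj.has_next %}
#     <a href="?page={{ stats.page_obj.next_page_number }}&page_size={{ stats.page_size }}">Next page</a>
#   {% endif %}
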
def search_page(request):
    """Search page view."""
    keyword = request.GET.get('q', '').strip()
    contents = []

    if keyword:
        # Record the search keyword; refresh last_used when the keyword already
        # exists (get_or_create alone only sets it on first creation)
        keyword_obj, created = SearchKeyword.objects.get_or_create(
            keyword=keyword,
            defaults={'last_used': timezone.now()}
        )
        if not created:
            keyword_obj.last_used = timezone.now()
            keyword_obj.save(update_fields=['last_used'])

        # Search title, body, and matched keywords
        contents = CrawledContent.objects.filter(
            Q(title__icontains=keyword) |
            Q(content__icontains=keyword) |
            Q(keywords_matched__icontains=keyword)
        ).order_by('-created_at')[:50]

    return render(request, 'crawler/search.html', {
        'keyword': keyword,
        'contents': contents
    })
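
# Note: the icontains search above scans the whole table; for larger corpora,
# PostgreSQL full-text search is a common upgrade path (a sketch, assuming a
# PostgreSQL backend with django.contrib.postgres installed):
#
#   from django.contrib.postgres.search import SearchVector
#
#   contents = CrawledContent.objects.annotate(
#       search=SearchVector('title', 'content'),
#   ).filter(search=keyword)[:50]
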
def preview_crawled_content(request, content_id):
    """Preview a crawled article."""
    content = get_object_or_404(CrawledContent, id=content_id)

    # Associated media files
    media_files = content.media_files.all()

    # Build the media-file HTML
    media_section = ""
    if media_files:
        media_section = """
        <div class="media-section">
            <h3>Media Files</h3>
        """
        for media_file in media_files:
            if media_file.media_type == 'image':
                media_section += f"""
                <div class="media-item">
                    <h4>Image: {media_file.alt_text or 'Untitled'}</h4>
                    <img src="/media/{media_file.local_file.name}" alt="{media_file.alt_text}" style="max-width: 100%; height: auto;">
                    <p><small>Original URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>File size: {media_file.file_size_display}</small></p>
                </div>
                """
            elif media_file.media_type == 'video':
                media_section += f"""
                <div class="media-item">
                    <h4>Video</h4>
                    <video controls style="max-width: 100%;">
                        <source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
                        Your browser does not support video playback.
                    </video>
                    <p><small>Original URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>File size: {media_file.file_size_display}</small></p>
                </div>
                """
            elif media_file.media_type == 'audio':
                media_section += f"""
                <div class="media-item">
                    <h4>Audio</h4>
                    <audio controls>
                        <source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
                        Your browser does not support audio playback.
                    </audio>
                    <p><small>Original URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>File size: {media_file.file_size_display}</small></p>
                </div>
                """
            else:
                media_section += f"""
                <div class="media-item">
                    <h4>File: {media_file.get_media_type_display()}</h4>
                    <p><a href="/media/{media_file.local_file.name}" download>Download file</a></p>
                    <p><small>Original URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>File size: {media_file.file_size_display}</small></p>
                </div>
                """
        media_section += " </div>"
    # Format the body: escape the crawled text so it cannot inject markup, then
    # convert blank lines into paragraph breaks and single newlines into <br> tags
    formatted_content = escape(content.content).replace('\n\n', '</p><p>').replace('\n', '<br>')

    # Generate the preview page dynamically
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>{escape(content.title)}</title>
        <style>
            body {{
                font-family: Arial, sans-serif;
                line-height: 1.6;
                max-width: 1200px;
                margin: 40px auto;
            }}
            h1 {{ color: #333; margin-bottom: 20px; }}
            .meta {{
                color: #666;
                margin-bottom: 30px;
                padding: 20px;
                background-color: #f8f9fa;
                border-radius: 8px;
                border-left: 4px solid #007bff;
            }}
            .content {{
                line-height: 1.8;
                font-size: 16px;
                margin-bottom: 30px;
            }}
            .content p {{
                margin-bottom: 1em;
            }}
            .media-section {{
                margin-top: 30px;
                padding: 20px;
                background-color: #f8f9fa;
                border-radius: 8px;
            }}
            .media-item {{
                margin-bottom: 20px;
                padding: 15px;
                border: 1px solid #ddd;
                border-radius: 5px;
                background-color: white;
            }}
            .media-item h4 {{
                margin-top: 0;
                color: #555;
                border-bottom: 1px solid #eee;
                padding-bottom: 10px;
            }}
            .back-link {{
                margin-bottom: 20px;
            }}
            .back-link a {{
                color: #007bff;
                text-decoration: none;
                font-weight: bold;
                padding: 8px 16px;
                background-color: #f8f9fa;
                border: 1px solid #ddd;
                border-radius: 4px;
            }}
            .back-link a:hover {{
                text-decoration: underline;
                background-color: #e9ecef;
            }}
            .navbar {{
                background-color: #007bff;
                padding: 15px;
                margin-bottom: 30px;
                border-radius: 8px;
            }}
            .navbar a {{
                color: white;
                text-decoration: none;
                margin-right: 20px;
                font-weight: bold;
            }}
            .navbar a:hover {{
                text-decoration: underline;
            }}
        </style>
    </head>
    <body>
        <div class="navbar">
            <a href="/">Dashboard</a>
            <a href="/admin/crawler/crawledcontent/">Admin</a>
            <a href="javascript:history.back()">← Back</a>
        </div>

        <h1>{escape(content.title)}</h1>

        <div class="meta">
            <p><strong>Source site:</strong> {content.website.name} ({content.website.region})</p>
            <p><strong>Original link:</strong> <a href="{content.url}" target="_blank">{content.url}</a></p>
            <p><strong>Publish date:</strong> {content.publish_date or 'Unknown'}</p>
            <p><strong>Author:</strong> {content.author or 'Unknown'}</p>
            <p><strong>Matched keywords:</strong> {content.keywords_matched}</p>
            <p><strong>Crawled at:</strong> {content.created_at}</p>
            <p><strong>Media files:</strong> {len(media_files)}</p>
        </div>

        <div class="content">
            <p>{formatted_content}</p>
        </div>

        {media_section}
    </body>
    </html>
    """
    return HttpResponse(html_content, content_type='text/html; charset=utf-8')
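
# The preview above links media via "/media/..." URLs; serving those during
# development uses Django's standard static helper (for reference, in the
# project-level urls.py):
#
#   from django.conf import settings
#   from django.conf.urls.static import static
#
#   urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
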
def download_crawled_content(request, content_id):
    """Download an article as a zip archive (Word document plus media files)."""
    content = get_object_or_404(CrawledContent, id=content_id)

    # In-memory buffer for the zip archive
    zip_buffer = BytesIO()

    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
        # Build the Word document
        doc = Document()
        doc.add_heading(content.title, 0)

        # Metadata
        doc.add_paragraph(f'Source site: {content.website.name} ({content.website.region})')
        doc.add_paragraph(f'Original link: {content.url}')
        doc.add_paragraph(f'Publish date: {content.publish_date or "Unknown"}')
        doc.add_paragraph(f'Author: {content.author or "Unknown"}')
        doc.add_paragraph(f'Matched keywords: {content.keywords_matched}')
        doc.add_paragraph(f'Crawled at: {content.created_at}')

        # Body text
        doc.add_heading('Body', level=1)
        for paragraph in content.content.split('\n\n'):
            if paragraph.strip():
                doc.add_paragraph(paragraph.strip())

        # Save the Word document to memory
        doc_buffer = BytesIO()
        doc.save(doc_buffer)
        doc_buffer.seek(0)

        # Add the Word document to the archive
        zip_file.writestr(f"{content.title[:50]}.docx", doc_buffer.getvalue())

        # Add media files to the archive
        media_files = content.media_files.all()
        for media_file in media_files:
            try:
                if media_file.local_file and default_storage.exists(media_file.local_file.name):
                    # Read the stored file, closing the handle promptly
                    with default_storage.open(media_file.local_file.name) as stored_file:
                        file_content = stored_file.read()
                    zip_file.writestr(f"media/{os.path.basename(media_file.local_file.name)}", file_content)
            except Exception:
                # Skip files that cannot be read and continue with the rest
                pass

    # Prepare the response
    zip_buffer.seek(0)
    response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
    filename = f"{content.title[:50]}.zip"
    response['Content-Disposition'] = f'attachment; filename="{filename}"'

    return response
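
# Quick sanity check of the single-article download using Django's test client
# (a sketch; the URL pattern is an assumption about this project's urls.py):
#
#   from django.test import Client
#
#   response = Client().get('/content/1/download/')
#   assert response['Content-Type'] == 'application/zip'
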
def download_selected_contents(request):
    """Download the selected articles as a single zip archive."""
    if request.method == 'POST':
        # IDs of the selected articles
        selected_ids = request.POST.getlist('selected_contents')

        if not selected_ids:
            # Nothing was selected
            return HttpResponse("Please select at least one article", status=400)

        # Fetch the selected articles
        contents = CrawledContent.objects.filter(id__in=selected_ids)

        if not contents.exists():
            return HttpResponse("Selected articles not found", status=404)

        # In-memory buffer for the zip archive
        zip_buffer = BytesIO()

        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for content in contents:
                # One folder per article, with a filesystem-safe name
                folder_name = f"{content.title[:30].strip()}"
                folder_name = "".join(c for c in folder_name if c.isalnum() or c in (' ', '-', '_')).rstrip()

                # Build the Word document
                doc = Document()
                doc.add_heading(content.title, 0)

                # Metadata
                doc.add_paragraph(f'Source site: {content.website.name} ({content.website.region})')
                doc.add_paragraph(f'Original link: {content.url}')
                doc.add_paragraph(f'Publish date: {content.publish_date or "Unknown"}')
                doc.add_paragraph(f'Author: {content.author or "Unknown"}')
                doc.add_paragraph(f'Matched keywords: {content.keywords_matched}')
                doc.add_paragraph(f'Crawled at: {content.created_at}')

                # Body text
                doc.add_heading('Body', level=1)
                for paragraph in content.content.split('\n\n'):
                    if paragraph.strip():
                        doc.add_paragraph(paragraph.strip())

                # Save the Word document to memory
                doc_buffer = BytesIO()
                doc.save(doc_buffer)
                doc_buffer.seek(0)

                # Add the Word document to the archive
                zip_file.writestr(f"{folder_name}/{content.title[:50]}.docx", doc_buffer.getvalue())

                # Add media files to the archive
                media_files = content.media_files.all()
                for media_file in media_files:
                    try:
                        if media_file.local_file and default_storage.exists(media_file.local_file.name):
                            # Read the stored file, closing the handle promptly
                            with default_storage.open(media_file.local_file.name) as stored_file:
                                file_content = stored_file.read()
                            zip_file.writestr(f"{folder_name}/media/{os.path.basename(media_file.local_file.name)}", file_content)
                    except Exception:
                        # Skip files that cannot be read and continue with the rest
                        pass

        # Prepare the response
        zip_buffer.seek(0)
        response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
        response['Content-Disposition'] = 'attachment; filename="selected_articles.zip"'

        return response

    return HttpResponse("Invalid request method", status=405)
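
# The POST handler above reads checkbox values named "selected_contents"; an
# illustrative form (template markup is an assumption, not part of this file):
#
#   <form method="post" action="/contents/download/">
#     {% csrf_token %}
#     {% for content in stats.page_obj %}
#       <input type="checkbox" name="selected_contents" value="{{ content.id }}">
#     {% endfor %}
#     <button type="submit">Download selected</button>
#   </form>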