Files
icac/crawler/views.py
2025-09-23 13:30:03 +08:00

292 lines
9.8 KiB
Python

import json
from html import escape

from django.conf import settings
from django.core.paginator import Paginator
from django.db.models import Count, Q
from django.db.models.functions import TruncDate
from django.http import HttpResponse
from django.shortcuts import get_object_or_404, render
from django.utils import timezone
from rest_framework import filters, viewsets
from rest_framework.decorators import action
from rest_framework.response import Response

from .models import CrawledContent, CrawlLog, CrawlTask, SearchKeyword, Website
from .serializers import (
    CrawledContentSerializer,
    CrawlLogSerializer,
    CrawlTaskSerializer,
    SearchKeywordSerializer,
    WebsiteSerializer,
)


def dashboard(request):
    """Render the crawler dashboard.

    Reads three optional query parameters from ``request.GET``:

    * ``website``   - id of a website to restrict the article list to; a value
      that cannot be converted to ``int`` is ignored (no filtering happens).
    * ``page``      - page number handed to the paginator (defaults to 1).
    * ``page_size`` - articles per page; clamped to the range 10..100 and
      falling back to 20 when it is not an integer.

    Renders ``crawler/dashboard.html`` with a single context entry ``stats``
    holding the counters, the active websites, the current page of crawled
    content (also grouped by website name) and the five newest crawl tasks.
    """
    # Headline counters.
    active_sites = Website.objects.filter(is_active=True)
    website_count = active_sites.count()
    task_count = CrawlTask.objects.count()
    content_count = CrawledContent.objects.count()
    running_task_count = CrawlTask.objects.filter(status='running').count()

    # Pagination parameters; page_size is clamped to a sane range.
    page_number = request.GET.get('page', 1)
    try:
        requested_size = int(request.GET.get('page_size', 20))
    except (ValueError, TypeError):
        page_size = 20
    else:
        page_size = min(max(requested_size, 10), 100)

    # Crawled content ordered by website name, newest first within a website.
    content_qs = CrawledContent.objects.select_related('website').order_by(
        'website__name', '-created_at'
    )

    # Optional restriction to one website; a non-numeric id leaves the
    # queryset unfiltered and the raw value is passed through to the template.
    selected_website_id = request.GET.get('website')
    if selected_website_id:
        try:
            selected_website_id = int(selected_website_id)
            content_qs = content_qs.filter(website_id=selected_website_id)
        except (ValueError, TypeError):
            pass

    page_obj = Paginator(content_qs, page_size).get_page(page_number)

    # Group the current page's items by the name of the website they belong to.
    contents_by_website = {}
    for item in page_obj:
        contents_by_website.setdefault(item.website.name, []).append(item)

    # Count of related media files over all crawled content.
    media_totals = CrawledContent.objects.aggregate(total_media=Count('media_files'))
    media_file_count = media_totals['total_media'] or 0

    return render(request, 'crawler/dashboard.html', {
        'stats': {
            'total_websites': website_count,
            'total_tasks': task_count,
            'total_contents': content_count,
            'active_tasks': running_task_count,
            'websites': active_sites.order_by('name'),
            'selected_website_id': selected_website_id,
            'page_obj': page_obj,
            'contents_by_website': contents_by_website,
            'page_size': page_size,
            'recent_tasks': CrawlTask.objects.order_by('-created_at')[:5],
            'total_media_files': media_file_count,
        },
    })
def search_page(request):
    """Render the search page and, when a query is given, its results.

    The search term is read from the ``q`` query parameter and stripped of
    surrounding whitespace. For a non-empty term:

    * the term is recorded in ``SearchKeyword`` and its ``last_used``
      timestamp is set to the current time on every search, not only the
      first time the term is seen;
    * up to 50 ``CrawledContent`` rows whose title, content or matched
      keywords contain the term (case-insensitive) are returned, newest first.

    Renders ``crawler/search.html`` with ``keyword`` and ``contents``;
    ``contents`` is an empty list when no term was supplied.
    """
    keyword = request.GET.get('q', '').strip()
    contents = []

    if keyword:
        # update_or_create (rather than get_or_create) so that `last_used`
        # is refreshed for keywords that already exist; get_or_create only
        # applies `defaults` when it inserts a new row.
        SearchKeyword.objects.update_or_create(
            keyword=keyword,
            defaults={'last_used': timezone.now()},
        )

        contents = CrawledContent.objects.filter(
            Q(title__icontains=keyword)
            | Q(content__icontains=keyword)
            | Q(keywords_matched__icontains=keyword)
        ).order_by('-created_at')[:50]

    return render(request, 'crawler/search.html', {
        'keyword': keyword,
        'contents': contents,
    })
# Stylesheet for the stand-alone preview page. Kept as a plain string so the
# CSS braces do not have to be doubled inside the page f-string.
_PREVIEW_CSS = """
        body {
            font-family: Arial, sans-serif;
            margin: 40px;
            line-height: 1.6;
            max-width: 1200px;
            margin: 40px auto;
        }
        h1 { color: #333; margin-bottom: 20px; }
        .meta {
            color: #666;
            margin-bottom: 30px;
            padding: 20px;
            background-color: #f8f9fa;
            border-radius: 8px;
            border-left: 4px solid #007bff;
        }
        .content {
            line-height: 1.8;
            font-size: 16px;
            margin-bottom: 30px;
        }
        .content p {
            margin-bottom: 1em;
        }
        .media-section {
            margin-top: 30px;
            padding: 20px;
            background-color: #f8f9fa;
            border-radius: 8px;
        }
        .media-item {
            margin-bottom: 20px;
            padding: 15px;
            border: 1px solid #ddd;
            border-radius: 5px;
            background-color: white;
        }
        .media-item h4 {
            margin-top: 0;
            color: #555;
            border-bottom: 1px solid #eee;
            padding-bottom: 10px;
        }
        .back-link {
            margin-bottom: 20px;
        }
        .back-link a {
            color: #007bff;
            text-decoration: none;
            font-weight: bold;
            padding: 8px 16px;
            background-color: #f8f9fa;
            border: 1px solid #ddd;
            border-radius: 4px;
        }
        .back-link a:hover {
            text-decoration: underline;
            background-color: #e9ecef;
        }
        .navbar {
            background-color: #007bff;
            padding: 15px;
            margin-bottom: 30px;
            border-radius: 8px;
        }
        .navbar a {
            color: white;
            text-decoration: none;
            margin-right: 20px;
            font-weight: bold;
        }
        .navbar a:hover {
            text-decoration: underline;
        }
"""


def _esc(value):
    """Return ``str(value)`` escaped for HTML text or a quoted attribute value."""
    return escape(str(value), quote=True)


def _render_media_item(media_file):
    """Return the HTML fragment for one media file attached to a crawled page.

    Images, videos and audio get an inline player/preview; any other media
    type gets a download link. Every value taken from ``media_file`` is
    HTML-escaped because it originates from crawled, untrusted pages.
    """
    # The "/media/" prefix is the one the page has always used for local files.
    file_url = _esc(f"/media/{media_file.local_file.name}")
    source_url = _esc(media_file.original_url)

    media_type = media_file.media_type
    if media_type == 'image':
        heading = _esc(media_file.alt_text or '无标题')
        # A missing alt text becomes an empty attribute instead of "None".
        alt_text = _esc(media_file.alt_text or '')
        body = (
            f'<h4>图片: {heading}</h4>\n'
            f'<img src="{file_url}" alt="{alt_text}" style="max-width: 100%; height: auto;">\n'
        )
    elif media_type == 'video':
        body = (
            '<h4>视频</h4>\n'
            '<video controls style="max-width: 100%;">\n'
            f'<source src="{file_url}" type="{_esc(media_file.mime_type)}">\n'
            '您的浏览器不支持视频播放。\n'
            '</video>\n'
        )
    elif media_type == 'audio':
        body = (
            '<h4>音频</h4>\n'
            '<audio controls>\n'
            f'<source src="{file_url}" type="{_esc(media_file.mime_type)}">\n'
            '您的浏览器不支持音频播放。\n'
            '</audio>\n'
        )
    else:
        body = (
            f'<h4>文件: {_esc(media_file.get_media_type_display())}</h4>\n'
            f'<p><a href="{file_url}" download>下载文件</a></p>\n'
        )

    footer = (
        f'<p><small>原始URL: <a href="{source_url}" target="_blank">{source_url}</a></small></p>\n'
        f'<p><small>文件大小: {_esc(media_file.file_size_display)}</small></p>\n'
    )
    return f'<div class="media-item">\n{body}{footer}</div>\n'


def preview_crawled_content(request, content_id):
    """Return a stand-alone HTML preview page for one crawled article.

    Args:
        request: The incoming HTTP request (not otherwise inspected).
        content_id: Primary key of the ``CrawledContent`` row to preview.

    Returns:
        An ``HttpResponse`` with content type ``text/html; charset=utf-8``
        containing the article's metadata, its text and its media files.

    Raises:
        Http404: via ``get_object_or_404`` when no row has that id.

    All article and media fields are HTML-escaped before being placed in the
    page: they come from crawled third-party sites and must not be able to
    inject markup or script into this page.
    """
    content = get_object_or_404(CrawledContent, id=content_id)
    media_files = content.media_files.all()

    # Media section is only emitted when the article has at least one file.
    media_section = ""
    if media_files:
        items_html = "".join(_render_media_item(item) for item in media_files)
        media_section = (
            '<div class="media-section">\n'
            '<h3>媒体文件</h3>\n'
            f'{items_html}'
            '</div>'
        )

    # Escape first, then turn blank lines into paragraph breaks and single
    # newlines into <br>, so only the tags inserted here are live markup.
    # A missing body is treated as empty text.
    formatted_content = (
        _esc(content.content or '')
        .replace('\n\n', '</p><p>')
        .replace('\n', '<br>')
    )

    title = _esc(content.title)
    # NOTE(review): escaping stops markup injection through the URL but does
    # not validate its scheme; confirm the crawler only stores http(s) URLs.
    page_url = _esc(content.url)

    html_content = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>{title}</title>
    <style>{_PREVIEW_CSS}    </style>
</head>
<body>
    <div class="navbar">
        <a href="/">仪表板</a>
        <a href="/admin/crawler/crawledcontent/">管理界面</a>
        <a href="javascript:history.back()">← 返回</a>
    </div>
    <h1>{title}</h1>
    <div class="meta">
        <p><strong>来源网站:</strong> {_esc(content.website.name)} ({_esc(content.website.region)})</p>
        <p><strong>原始链接:</strong> <a href="{page_url}" target="_blank">{page_url}</a></p>
        <p><strong>发布时间:</strong> {_esc(content.publish_date or '未知')}</p>
        <p><strong>作者:</strong> {_esc(content.author or '未知')}</p>
        <p><strong>匹配关键字:</strong> {_esc(content.keywords_matched)}</p>
        <p><strong>爬取时间:</strong> {_esc(content.created_at)}</p>
        <p><strong>媒体文件数量:</strong> {len(media_files)}</p>
    </div>
    <div class="content">
        <p>{formatted_content}</p>
    </div>
    {media_section}
</body>
</html>
"""
    return HttpResponse(html_content, content_type='text/html; charset=utf-8')