292 lines
9.8 KiB
Python
292 lines
9.8 KiB
Python
from django.shortcuts import render, get_object_or_404
|
|
from django.http import HttpResponse
|
|
from django.db.models import Q, Count
|
|
from django.conf import settings
|
|
from django.utils import timezone
|
|
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword
|
|
from rest_framework import viewsets, filters
|
|
from rest_framework.decorators import action
|
|
from rest_framework.response import Response
|
|
from .serializers import (
|
|
WebsiteSerializer, CrawlTaskSerializer, CrawledContentSerializer,
|
|
CrawlLogSerializer, SearchKeywordSerializer
|
|
)
|
|
import json
|
|
from django.core.paginator import Paginator
|
|
from django.db.models.functions import TruncDate
|
|
from django.db.models import Count
|
|
|
|
|
|
def dashboard(request):
    """Render the crawler dashboard.

    Gathers headline counters, the active websites, the five most recently
    created crawl tasks and one page of crawled content (optionally limited
    to a single website), then renders ``crawler/dashboard.html`` with all
    of it packed under the ``stats`` context key.

    Query parameters read from ``request.GET``:
        website: optional website id used to filter the content list.
        page: page number handed to the paginator (defaults to 1).
        page_size: items per page; clamped to 10..100, or 20 when the
            value cannot be parsed as an integer.
    """
    params = request.GET

    # Headline counters.
    website_count = Website.objects.filter(is_active=True).count()
    task_count = CrawlTask.objects.count()
    content_count = CrawledContent.objects.count()
    running_count = CrawlTask.objects.filter(status='running').count()

    # Active websites, listed by name.
    active_sites = Website.objects.filter(is_active=True).order_by('name')

    selected_website_id = params.get('website')
    requested_page = params.get('page', 1)

    # Page size: parse, then clamp into the 10..100 window.
    try:
        page_size = int(params.get('page_size', 20))
    except (ValueError, TypeError):
        page_size = 20
    else:
        page_size = min(max(page_size, 10), 100)

    # Crawled content ordered by website name, newest first within a website.
    entries = CrawledContent.objects.select_related('website').order_by(
        'website__name', '-created_at'
    )

    # Narrow to one website when a usable id was supplied.
    if selected_website_id:
        try:
            selected_website_id = int(selected_website_id)
            entries = entries.filter(website_id=selected_website_id)
        except (ValueError, TypeError):
            # Unparseable id: keep the unfiltered list and the raw value.
            pass

    page_obj = Paginator(entries, page_size).get_page(requested_page)

    # Group the current page's items under their website name.
    grouped = {}
    for entry in page_obj:
        grouped.setdefault(entry.website.name, []).append(entry)

    latest_tasks = CrawlTask.objects.order_by('-created_at')[:5]

    # Count of media files related to crawled content.
    media_totals = CrawledContent.objects.aggregate(
        total_media=Count('media_files')
    )
    media_count = media_totals['total_media'] or 0

    context = {
        'stats': {
            'total_websites': website_count,
            'total_tasks': task_count,
            'total_contents': content_count,
            'active_tasks': running_count,
            'websites': active_sites,
            'selected_website_id': selected_website_id,
            'page_obj': page_obj,
            'contents_by_website': grouped,
            'page_size': page_size,
            'recent_tasks': latest_tasks,
            'total_media_files': media_count,
        },
    }
    return render(request, 'crawler/dashboard.html', context)
|
|
|
|
|
|
def search_page(request):
    """Render the search page and, when a query is given, its results.

    The search term comes from the ``q`` GET parameter with surrounding
    whitespace stripped. A non-empty term is recorded in ``SearchKeyword``
    and matched case-insensitively as a substring against the title, body
    and matched-keywords fields of ``CrawledContent``. At most 50 hits are
    returned, ordered by ``created_at`` descending.

    The template ``crawler/search.html`` receives ``keyword`` and
    ``contents`` (an empty list when no term was supplied).
    """
    keyword = request.GET.get('q', '').strip()
    contents = []

    if keyword:
        # update_or_create instead of get_or_create: ``defaults`` are only
        # applied by get_or_create when the row is first inserted, so
        # ``last_used`` was never refreshed for a keyword searched again.
        SearchKeyword.objects.update_or_create(
            keyword=keyword,
            defaults={'last_used': timezone.now()},
        )

        # Substring match on any of the three text fields.
        contents = CrawledContent.objects.filter(
            Q(title__icontains=keyword) |
            Q(content__icontains=keyword) |
            Q(keywords_matched__icontains=keyword)
        ).order_by('-created_at')[:50]

    return render(request, 'crawler/search.html', {
        'keyword': keyword,
        'contents': contents,
    })
|
|
|
|
|
|
def _esc(value):
    """Return ``str(value)`` with ``&``, ``<``, ``>`` and both quote characters HTML-escaped."""
    # Function-scope import: the module's import block does not bring in ``html``.
    from html import escape

    return escape(str(value), quote=True)


def _render_media_item(media_file):
    """Return the ``<div class="media-item">`` HTML for one media file.

    Images, videos and audio files are embedded with the matching tag; any
    other media type gets a download link. Every model value is escaped
    before being placed into the markup.
    """
    file_src = _esc(f"/media/{media_file.local_file.name}")
    original_url = _esc(media_file.original_url)
    file_size = _esc(media_file.file_size_display)
    media_type = media_file.media_type

    if media_type == 'image':
        heading = _esc(media_file.alt_text or '无标题')
        # An unset alt text becomes an empty attribute rather than "None".
        alt_text = _esc(media_file.alt_text or '')
        body = f"""
            <h4>图片: {heading}</h4>
            <img src="{file_src}" alt="{alt_text}" style="max-width: 100%; height: auto;">"""
    elif media_type == 'video':
        mime_type = _esc(media_file.mime_type)
        body = f"""
            <h4>视频</h4>
            <video controls style="max-width: 100%;">
                <source src="{file_src}" type="{mime_type}">
                您的浏览器不支持视频播放。
            </video>"""
    elif media_type == 'audio':
        mime_type = _esc(media_file.mime_type)
        body = f"""
            <h4>音频</h4>
            <audio controls>
                <source src="{file_src}" type="{mime_type}">
                您的浏览器不支持音频播放。
            </audio>"""
    else:
        type_label = _esc(media_file.get_media_type_display())
        body = f"""
            <h4>文件: {type_label}</h4>
            <p><a href="{file_src}" download>下载文件</a></p>"""

    # The source-URL and size lines are common to every media type.
    return f"""
        <div class="media-item">{body}
            <p><small>原始URL: <a href="{original_url}" target="_blank">{original_url}</a></small></p>
            <p><small>文件大小: {file_size}</small></p>
        </div>
    """


def preview_crawled_content(request, content_id):
    """Return a standalone HTML preview page for one crawled content item.

    Looks up the ``CrawledContent`` with primary key ``content_id`` (HTTP 404
    via ``get_object_or_404`` when it does not exist) and builds a page
    showing its metadata, its body text and its attached media files.

    The page is assembled by string formatting rather than a template, so
    every value taken from the database is HTML-escaped here: the data is
    crawled content and must not be trusted to be free of markup.

    Args:
        request: the incoming request (not otherwise inspected).
        content_id: primary key of the content item to preview.

    Returns:
        ``HttpResponse`` with content type ``text/html; charset=utf-8``.
    """
    content = get_object_or_404(CrawledContent, id=content_id)

    # Evaluate the related media once; reused for rendering and the count.
    media_files = list(content.media_files.all())

    media_section = ""
    if media_files:
        media_items = "".join(_render_media_item(item) for item in media_files)
        media_section = f"""
    <div class="media-section">
        <h3>媒体文件</h3>
        {media_items}
    </div>"""

    # Escape first, then turn blank lines into paragraph breaks and single
    # newlines into <br>, so only the tags added here reach the page as markup.
    body_text = _esc(content.content or '')
    formatted_content = body_text.replace('\n\n', '</p><p>').replace('\n', '<br>')

    title = _esc(content.title)
    website_name = _esc(content.website.name)
    website_region = _esc(content.website.region)
    source_url = _esc(content.url)
    publish_date = _esc(content.publish_date or '未知')
    author = _esc(content.author or '未知')
    keywords_matched = _esc(content.keywords_matched)
    created_at = _esc(content.created_at)
    media_count = len(media_files)

    html_content = f"""
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>{title}</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            margin: 40px;
            line-height: 1.6;
            max-width: 1200px;
            margin: 40px auto;
        }}
        h1 {{ color: #333; margin-bottom: 20px; }}
        .meta {{
            color: #666;
            margin-bottom: 30px;
            padding: 20px;
            background-color: #f8f9fa;
            border-radius: 8px;
            border-left: 4px solid #007bff;
        }}
        .content {{
            line-height: 1.8;
            font-size: 16px;
            margin-bottom: 30px;
        }}
        .content p {{
            margin-bottom: 1em;
        }}
        .media-section {{
            margin-top: 30px;
            padding: 20px;
            background-color: #f8f9fa;
            border-radius: 8px;
        }}
        .media-item {{
            margin-bottom: 20px;
            padding: 15px;
            border: 1px solid #ddd;
            border-radius: 5px;
            background-color: white;
        }}
        .media-item h4 {{
            margin-top: 0;
            color: #555;
            border-bottom: 1px solid #eee;
            padding-bottom: 10px;
        }}
        .back-link {{
            margin-bottom: 20px;
        }}
        .back-link a {{
            color: #007bff;
            text-decoration: none;
            font-weight: bold;
            padding: 8px 16px;
            background-color: #f8f9fa;
            border: 1px solid #ddd;
            border-radius: 4px;
        }}
        .back-link a:hover {{
            text-decoration: underline;
            background-color: #e9ecef;
        }}
        .navbar {{
            background-color: #007bff;
            padding: 15px;
            margin-bottom: 30px;
            border-radius: 8px;
        }}
        .navbar a {{
            color: white;
            text-decoration: none;
            margin-right: 20px;
            font-weight: bold;
        }}
        .navbar a:hover {{
            text-decoration: underline;
        }}
    </style>
</head>
<body>
    <div class="navbar">
        <a href="/">仪表板</a>
        <a href="/admin/crawler/crawledcontent/">管理界面</a>
        <a href="javascript:history.back()">← 返回</a>
    </div>

    <h1>{title}</h1>

    <div class="meta">
        <p><strong>来源网站:</strong> {website_name} ({website_region})</p>
        <p><strong>原始链接:</strong> <a href="{source_url}" target="_blank">{source_url}</a></p>
        <p><strong>发布时间:</strong> {publish_date}</p>
        <p><strong>作者:</strong> {author}</p>
        <p><strong>匹配关键字:</strong> {keywords_matched}</p>
        <p><strong>爬取时间:</strong> {created_at}</p>
        <p><strong>媒体文件数量:</strong> {media_count}</p>
    </div>

    <div class="content">
        <p>{formatted_content}</p>
    </div>

    {media_section}
</body>
</html>
"""
    return HttpResponse(html_content, content_type='text/html; charset=utf-8')