Base setup
This commit is contained in:
292
crawler/views.py
Normal file
292
crawler/views.py
Normal file
@@ -0,0 +1,292 @@
|
||||
from django.shortcuts import render, get_object_or_404
|
||||
from django.http import HttpResponse
|
||||
from django.db.models import Q, Count
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword
|
||||
from rest_framework import viewsets, filters
|
||||
from rest_framework.decorators import action
|
||||
from rest_framework.response import Response
|
||||
from .serializers import (
|
||||
WebsiteSerializer, CrawlTaskSerializer, CrawledContentSerializer,
|
||||
CrawlLogSerializer, SearchKeywordSerializer
|
||||
)
|
||||
import json
|
||||
from django.core.paginator import Paginator
|
||||
from django.db.models.functions import TruncDate
|
||||
from django.db.models import Count
|
||||
|
||||
|
||||
def dashboard(request):
    """Render the dashboard: headline counters plus a paginated content list.

    Query parameters read from ``request.GET``:
        website:   optional website id; when it parses as an integer the
                   content list is restricted to that website.
        page:      page number, handed to ``Paginator.get_page``.
        page_size: items per page; clamped to the range 10..100 and reset
                   to 20 when it is not an integer.

    Returns:
        The rendered ``crawler/dashboard.html`` response. The template
        receives a single ``stats`` dict holding the counters, the active
        websites, the current page object, the page's items grouped by
        website name, the five most recent tasks and the media-file count.
    """
    default_page_size = 20

    # Headline counters.
    total_websites = Website.objects.filter(is_active=True).count()
    total_tasks = CrawlTask.objects.count()
    total_contents = CrawledContent.objects.count()
    active_tasks = CrawlTask.objects.filter(status='running').count()

    # Active websites, listed alphabetically.
    websites = Website.objects.filter(is_active=True).order_by('name')

    # Pagination parameters.
    page_number = request.GET.get('page', 1)
    try:
        page_size = int(request.GET.get('page_size', default_page_size))
    except (ValueError, TypeError):
        page_size = default_page_size
    else:
        # Keep the page size within a sane range.
        page_size = max(10, min(100, page_size))

    # Selected website. Anything that is not an integer is normalised to
    # None so the template never receives a "selected" id that was not
    # actually applied as a filter.
    selected_website_id = None
    raw_website_id = request.GET.get('website')
    if raw_website_id:
        try:
            selected_website_id = int(raw_website_id)
        except (ValueError, TypeError):
            selected_website_id = None

    # All crawled content, ordered by website name, newest first within a
    # website.
    all_contents = CrawledContent.objects.select_related('website').order_by(
        'website__name', '-created_at'
    )
    if selected_website_id is not None:
        all_contents = all_contents.filter(website_id=selected_website_id)

    paginator = Paginator(all_contents, page_size)
    page_obj = paginator.get_page(page_number)

    # Group the current page's items by website name. A plain dict is kept
    # here (rather than defaultdict) because the mapping is handed to a
    # template.
    contents_by_website = {}
    for content in page_obj:
        contents_by_website.setdefault(content.website.name, []).append(content)

    # Five most recently created tasks.
    recent_tasks = CrawlTask.objects.order_by('-created_at')[:5]

    # Media-file count, aggregated through the ``media_files`` relation.
    total_media_files = CrawledContent.objects.aggregate(
        total_media=Count('media_files')
    )['total_media'] or 0

    stats = {
        'total_websites': total_websites,
        'total_tasks': total_tasks,
        'total_contents': total_contents,
        'active_tasks': active_tasks,
        'websites': websites,
        'selected_website_id': selected_website_id,
        'page_obj': page_obj,
        'contents_by_website': contents_by_website,
        'page_size': page_size,
        'recent_tasks': recent_tasks,
        'total_media_files': total_media_files,
    }

    return render(request, 'crawler/dashboard.html', {'stats': stats})
|
||||
|
||||
|
||||
def search_page(request):
    """Render the search page and, when a keyword is given, its results.

    The keyword is read from the ``q`` query parameter and stripped of
    surrounding whitespace. A non-empty keyword is recorded in
    ``SearchKeyword`` and matched case-insensitively against the title,
    content and matched-keywords fields of ``CrawledContent``.

    Returns:
        The rendered ``crawler/search.html`` response with ``keyword`` and
        ``contents`` (at most the 50 newest matches, or an empty list when
        no keyword was supplied).
    """
    keyword = request.GET.get('q', '').strip()
    contents = []

    if keyword:
        # Record the search keyword. update_or_create is used instead of
        # get_or_create because ``defaults`` in get_or_create is applied
        # only when the row is created, which left ``last_used`` stale for
        # every keyword that had been searched before.
        SearchKeyword.objects.update_or_create(
            keyword=keyword,
            defaults={'last_used': timezone.now()},
        )

        # Search title, body and matched keywords; newest first, capped at 50.
        matches = (
            Q(title__icontains=keyword)
            | Q(content__icontains=keyword)
            | Q(keywords_matched__icontains=keyword)
        )
        contents = CrawledContent.objects.filter(matches).order_by('-created_at')[:50]

    return render(request, 'crawler/search.html', {
        'keyword': keyword,
        'contents': contents,
    })
|
||||
|
||||
|
||||
# Stylesheet for the generated preview page. Kept as a plain (non-f) string so
# the CSS braces do not need doubling.
_PREVIEW_STYLE = """
        body {
            font-family: Arial, sans-serif;
            margin: 40px;
            line-height: 1.6;
            max-width: 1200px;
            margin: 40px auto;
        }
        h1 { color: #333; margin-bottom: 20px; }
        .meta {
            color: #666;
            margin-bottom: 30px;
            padding: 20px;
            background-color: #f8f9fa;
            border-radius: 8px;
            border-left: 4px solid #007bff;
        }
        .content {
            line-height: 1.8;
            font-size: 16px;
            margin-bottom: 30px;
        }
        .content p {
            margin-bottom: 1em;
        }
        .media-section {
            margin-top: 30px;
            padding: 20px;
            background-color: #f8f9fa;
            border-radius: 8px;
        }
        .media-item {
            margin-bottom: 20px;
            padding: 15px;
            border: 1px solid #ddd;
            border-radius: 5px;
            background-color: white;
        }
        .media-item h4 {
            margin-top: 0;
            color: #555;
            border-bottom: 1px solid #eee;
            padding-bottom: 10px;
        }
        .back-link {
            margin-bottom: 20px;
        }
        .back-link a {
            color: #007bff;
            text-decoration: none;
            font-weight: bold;
            padding: 8px 16px;
            background-color: #f8f9fa;
            border: 1px solid #ddd;
            border-radius: 4px;
        }
        .back-link a:hover {
            text-decoration: underline;
            background-color: #e9ecef;
        }
        .navbar {
            background-color: #007bff;
            padding: 15px;
            margin-bottom: 30px;
            border-radius: 8px;
        }
        .navbar a {
            color: white;
            text-decoration: none;
            margin-right: 20px;
            font-weight: bold;
        }
        .navbar a:hover {
            text-decoration: underline;
        }
"""


def _esc(value):
    """Return ``value`` as HTML-escaped text; ``None`` becomes an empty string."""
    # Function-scope import keeps this block self-contained.
    from html import escape

    if value is None:
        return ''
    return escape(str(value), quote=True)


def _safe_href(url):
    """Return an escaped ``href`` value, or ``#`` unless the URL is http(s)."""
    candidate = str(url or '').strip()
    # Crawled URLs are untrusted: escaping alone does not stop schemes such
    # as ``javascript:``, so only plain web links are emitted.
    if candidate.lower().startswith(('http://', 'https://')):
        return _esc(candidate)
    return '#'


def _render_media_item(media_file):
    """Return the HTML ``media-item`` block for one media file."""
    # NOTE(review): the "/media/" prefix is hard-coded, as in the original;
    # confirm it matches the project's MEDIA_URL.
    src = _esc(f"/media/{media_file.local_file.name}")
    mime_type = _esc(media_file.mime_type)
    media_type = media_file.media_type

    if media_type == 'image':
        heading = f"图片: {_esc(media_file.alt_text or '无标题')}"
        player = (
            f'<img src="{src}" alt="{_esc(media_file.alt_text)}" '
            'style="max-width: 100%; height: auto;">'
        )
    elif media_type == 'video':
        heading = "视频"
        player = (
            '<video controls style="max-width: 100%;">\n'
            f'            <source src="{src}" type="{mime_type}">\n'
            '            您的浏览器不支持视频播放。\n'
            '        </video>'
        )
    elif media_type == 'audio':
        heading = "音频"
        player = (
            '<audio controls>\n'
            f'            <source src="{src}" type="{mime_type}">\n'
            '            您的浏览器不支持音频播放。\n'
            '        </audio>'
        )
    else:
        heading = f"文件: {_esc(media_file.get_media_type_display())}"
        player = f'<p><a href="{src}" download>下载文件</a></p>'

    original_url = media_file.original_url
    return f"""
    <div class="media-item">
        <h4>{heading}</h4>
        {player}
        <p><small>原始URL: <a href="{_safe_href(original_url)}" target="_blank" rel="noopener noreferrer">{_esc(original_url)}</a></small></p>
        <p><small>文件大小: {_esc(media_file.file_size_display)}</small></p>
    </div>
"""


def preview_crawled_content(request, content_id):
    """Return a standalone HTML preview page for one crawled content item.

    The page shows the item's metadata, its body text (blank lines become
    paragraph breaks, single newlines become ``<br>``) and every attached
    media file. All values taken from the database are HTML-escaped before
    being interpolated, because they originate from crawled pages.

    Args:
        request: the incoming HTTP request (not otherwise used).
        content_id: primary key of the ``CrawledContent`` row to preview.

    Returns:
        An ``HttpResponse`` containing the generated HTML document.

    Raises:
        Http404: via ``get_object_or_404`` when no such content exists.
    """
    content = get_object_or_404(
        CrawledContent.objects.select_related('website'), id=content_id
    )

    # Evaluate the related media files once; the list is reused for both the
    # rendering loop and the count shown in the metadata box.
    media_files = list(content.media_files.all())

    media_section = ""
    if media_files:
        rendered_items = "".join(_render_media_item(item) for item in media_files)
        media_section = (
            '<div class="media-section">\n'
            '    <h3>媒体文件</h3>\n'
            f'{rendered_items}'
            '</div>'
        )

    # Escape first, then turn newlines into markup, so only the tags added
    # here are interpreted as HTML.
    formatted_content = (
        _esc(content.content).replace('\n\n', '</p><p>').replace('\n', '<br>')
    )

    title = _esc(content.title)
    html_content = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>{title}</title>
    <style>{_PREVIEW_STYLE}    </style>
</head>
<body>
    <div class="navbar">
        <a href="/">仪表板</a>
        <a href="/admin/crawler/crawledcontent/">管理界面</a>
        <a href="javascript:history.back()">← 返回</a>
    </div>

    <h1>{title}</h1>

    <div class="meta">
        <p><strong>来源网站:</strong> {_esc(content.website.name)} ({_esc(content.website.region)})</p>
        <p><strong>原始链接:</strong> <a href="{_safe_href(content.url)}" target="_blank" rel="noopener noreferrer">{_esc(content.url)}</a></p>
        <p><strong>发布时间:</strong> {_esc(content.publish_date or '未知')}</p>
        <p><strong>作者:</strong> {_esc(content.author or '未知')}</p>
        <p><strong>匹配关键字:</strong> {_esc(content.keywords_matched)}</p>
        <p><strong>爬取时间:</strong> {_esc(content.created_at)}</p>
        <p><strong>媒体文件数量:</strong> {len(media_files)}</p>
    </div>

    <div class="content">
        <p>{formatted_content}</p>
    </div>

    {media_section}
</body>
</html>
"""
    return HttpResponse(html_content, content_type='text/html; charset=utf-8')
|
||||
Reference in New Issue
Block a user