Base setup

2025-09-23 13:30:03 +08:00
parent 1057ed8690
commit e51154bb29
34 changed files with 2574 additions and 1 deletions
--- a/crawler/views.py
+++ b/crawler/views.py
@@ -0,0 +1,292 @@
+from django.shortcuts import render, get_object_or_404
+from django.http import HttpResponse
+from django.db.models import Q, Count
+from django.conf import settings
+from django.utils import timezone
+from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword
+from rest_framework import viewsets, filters
+from rest_framework.decorators import action
+from rest_framework.response import Response
+from .serializers import (
+    WebsiteSerializer, CrawlTaskSerializer, CrawledContentSerializer, 
+    CrawlLogSerializer, SearchKeywordSerializer
+)
+import json
+from django.core.paginator import Paginator
+from django.db.models.functions import TruncDate
+from django.db.models import Count
+
+
+def dashboard(request):
+    """仪表板视图"""
+    # 统计数据
+    total_websites = Website.objects.filter(is_active=True).count()
+    total_tasks = CrawlTask.objects.count()
+    total_contents = CrawledContent.objects.count()
+    active_tasks = CrawlTask.objects.filter(status='running').count()
+    
+    # 获取所有网站
+    websites = Website.objects.filter(is_active=True).order_by('name')
+    
+    # 获取当前选中的网站ID
+    selected_website_id = request.GET.get('website')
+    
+    # 获取分页参数
+    page_number = request.GET.get('page', 1)
+    page_size = request.GET.get('page_size', 20)  # 默认每页20篇文章
+    
+    # 尝试转换page_size为整数
+    try:
+        page_size = int(page_size)
+        # 限制page_size在合理范围内
+        page_size = max(10, min(100, page_size))
+    except (ValueError, TypeError):
+        page_size = 20
+    
+    # 获取所有爬取的内容，按网站和创建时间排序
+    all_contents = CrawledContent.objects.select_related('website').order_by('website__name', '-created_at')
+    
+    # 如果选择了特定网站，则进行过滤
+    if selected_website_id:
+        try:
+            selected_website_id = int(selected_website_id)
+            all_contents = all_contents.filter(website_id=selected_website_id)
+        except (ValueError, TypeError):
+            pass
+    
+    # 分页处理
+    paginator = Paginator(all_contents, page_size)
+    page_obj = paginator.get_page(page_number)
+    
+    # 按网站分组内容
+    contents_by_website = {}
+    for content in page_obj:
+        website_name = content.website.name
+        if website_name not in contents_by_website:
+            contents_by_website[website_name] = []
+        contents_by_website[website_name].append(content)
+    
+    # 最近的任务
+    recent_tasks = CrawlTask.objects.order_by('-created_at')[:5]
+    
+    # 媒体文件统计
+    total_media_files = CrawledContent.objects.aggregate(
+        total_media=Count('media_files')
+    )['total_media'] or 0
+    
+    stats = {
+        'total_websites': total_websites,
+        'total_tasks': total_tasks,
+        'total_contents': total_contents,
+        'active_tasks': active_tasks,
+        'websites': websites,
+        'selected_website_id': selected_website_id,
+        'page_obj': page_obj,
+        'contents_by_website': contents_by_website,
+        'page_size': page_size,
+        'recent_tasks': recent_tasks,
+        'total_media_files': total_media_files,
+    }
+    
+    return render(request, 'crawler/dashboard.html', {'stats': stats})
+
+
+def search_page(request):
+    """搜索页面视图"""
+    keyword = request.GET.get('q', '').strip()
+    contents = []
+    
+    if keyword:
+        # 记录搜索关键字
+        SearchKeyword.objects.get_or_create(
+            keyword=keyword,
+            defaults={'last_used': timezone.now()}
+        )
+        
+        # 搜索内容
+        contents = CrawledContent.objects.filter(
+            Q(title__icontains=keyword) |
+            Q(content__icontains=keyword) |
+            Q(keywords_matched__icontains=keyword)
+        ).order_by('-created_at')[:50]
+    
+    return render(request, 'crawler/search.html', {
+        'keyword': keyword,
+        'contents': contents
+    })
+
+
+def preview_crawled_content(request, content_id):
+    """预览爬取的内容"""
+    content = get_object_or_404(CrawledContent, id=content_id)
+    
+    # 获取媒体文件
+    media_files = content.media_files.all()
+    
+    # 生成媒体文件HTML
+    media_section = ""
+    if media_files:
+        media_section = """
+    <div class="media-section">
+        <h3>媒体文件</h3>
+"""
+        for media_file in media_files:
+            if media_file.media_type == 'image':
+                media_section += f"""
+        <div class="media-item">
+            <h4>图片: {media_file.alt_text or '无标题'}</h4>
+            <img src="/media/{media_file.local_file.name}" alt="{media_file.alt_text}" style="max-width: 100%; height: auto;">
+            <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
+            <p><small>文件大小: {media_file.file_size_display}</small></p>
+        </div>
+"""
+            elif media_file.media_type == 'video':
+                media_section += f"""
+        <div class="media-item">
+            <h4>视频</h4>
+            <video controls style="max-width: 100%;">
+                <source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
+                您的浏览器不支持视频播放。
+            </video>
+            <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
+            <p><small>文件大小: {media_file.file_size_display}</small></p>
+        </div>
+"""
+            elif media_file.media_type == 'audio':
+                media_section += f"""
+        <div class="media-item">
+            <h4>音频</h4>
+            <audio controls>
+                <source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
+                您的浏览器不支持音频播放。
+            </audio>
+            <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
+            <p><small>文件大小: {media_file.file_size_display}</small></p>
+        </div>
+"""
+            else:
+                media_section += f"""
+        <div class="media-item">
+            <h4>文件: {media_file.get_media_type_display()}</h4>
+            <p><a href="/media/{media_file.local_file.name}" download>下载文件</a></p>
+            <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
+            <p><small>文件大小: {media_file.file_size_display}</small></p>
+        </div>
+"""
+        media_section += "    </div>"
+    
+    # 处理内容格式，将换行符转换为段落和<br>标签
+    formatted_content = content.content.replace('\n\n', '</p><p>').replace('\n', '<br>')
+    
+    # 动态生成预览页面
+    html_content = f"""
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>{content.title}</title>
+    <style>
+        body {{ 
+            font-family: Arial, sans-serif; 
+            margin: 40px; 
+            line-height: 1.6;
+            max-width: 1200px;
+            margin: 40px auto;
+        }}
+        h1 {{ color: #333; margin-bottom: 20px; }}
+        .meta {{ 
+            color: #666; 
+            margin-bottom: 30px; 
+            padding: 20px;
+            background-color: #f8f9fa;
+            border-radius: 8px;
+            border-left: 4px solid #007bff;
+        }}
+        .content {{ 
+            line-height: 1.8; 
+            font-size: 16px;
+            margin-bottom: 30px;
+        }}
+        .content p {{
+            margin-bottom: 1em;
+        }}
+        .media-section {{ 
+            margin-top: 30px; 
+            padding: 20px;
+            background-color: #f8f9fa;
+            border-radius: 8px;
+        }}
+        .media-item {{ 
+            margin-bottom: 20px; 
+            padding: 15px; 
+            border: 1px solid #ddd; 
+            border-radius: 5px; 
+            background-color: white;
+        }}
+        .media-item h4 {{ 
+            margin-top: 0; 
+            color: #555; 
+            border-bottom: 1px solid #eee;
+            padding-bottom: 10px;
+        }}
+        .back-link {{
+            margin-bottom: 20px;
+        }}
+        .back-link a {{
+            color: #007bff;
+            text-decoration: none;
+            font-weight: bold;
+            padding: 8px 16px;
+            background-color: #f8f9fa;
+            border: 1px solid #ddd;
+            border-radius: 4px;
+        }}
+        .back-link a:hover {{
+            text-decoration: underline;
+            background-color: #e9ecef;
+        }}
+        .navbar {{
+            background-color: #007bff;
+            padding: 15px;
+            margin-bottom: 30px;
+            border-radius: 8px;
+        }}
+        .navbar a {{
+            color: white;
+            text-decoration: none;
+            margin-right: 20px;
+            font-weight: bold;
+        }}
+        .navbar a:hover {{
+            text-decoration: underline;
+        }}
+    </style>
+</head>
+<body>
+    <div class="navbar">
+        <a href="/">仪表板</a>
+        <a href="/admin/crawler/crawledcontent/">管理界面</a>
+        <a href="javascript:history.back()">← 返回</a>
+    </div>
+    
+    <h1>{content.title}</h1>
+    
+    <div class="meta">
+        <p><strong>来源网站:</strong> {content.website.name} ({content.website.region})</p>
+        <p><strong>原始链接:</strong> <a href="{content.url}" target="_blank">{content.url}</a></p>
+        <p><strong>发布时间:</strong> {content.publish_date or '未知'}</p>
+        <p><strong>作者:</strong> {content.author or '未知'}</p>
+        <p><strong>匹配关键字:</strong> {content.keywords_matched}</p>
+        <p><strong>爬取时间:</strong> {content.created_at}</p>
+        <p><strong>媒体文件数量:</strong> {len(media_files)}</p>
+    </div>
+    
+    <div class="content">
+        <p>{formatted_content}</p>
+    </div>
+    
+{media_section}
+</body>
+</html>
+    """
+    return HttpResponse(html_content, content_type='text/html; charset=utf-8')