"""Views for the crawler app: dashboard, keyword search, and content preview.

NOTE(review): this module arrived with all lines fused together and the
HTML preview template shredded; the code below is a faithful re-layout of
the recoverable logic. Spots that had to be reconstructed are marked.
"""
from django.shortcuts import render, get_object_or_404
from django.http import HttpResponse
from django.db.models import Q, Count
from django.db.models.functions import TruncDate
from django.conf import settings
from django.core.paginator import Paginator
from django.utils import timezone

from rest_framework import viewsets, filters
from rest_framework.decorators import action
from rest_framework.response import Response

from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword
from .serializers import (
    WebsiteSerializer, CrawlTaskSerializer, CrawledContentSerializer,
    CrawlLogSerializer, SearchKeywordSerializer
)

import json


def dashboard(request):
    """Dashboard view.

    Shows site-wide statistics, the paginated list of crawled content
    (optionally filtered to one website via ``?website=<id>``), grouped
    by website name, plus the five most recent crawl tasks.
    """
    # Aggregate statistics
    total_websites = Website.objects.filter(is_active=True).count()
    total_tasks = CrawlTask.objects.count()
    total_contents = CrawledContent.objects.count()
    active_tasks = CrawlTask.objects.filter(status='running').count()

    # All active websites, for the filter dropdown
    websites = Website.objects.filter(is_active=True).order_by('name')

    # Currently selected website id (raw query-string value; validated below)
    selected_website_id = request.GET.get('website')

    # Pagination parameters; page_size is clamped to [10, 100], default 20
    page_number = request.GET.get('page', 1)
    page_size = request.GET.get('page_size', 20)
    try:
        page_size = int(page_size)
        page_size = max(10, min(100, page_size))
    except (ValueError, TypeError):
        page_size = 20

    # All crawled content, ordered by website then newest first.
    # select_related avoids one query per row when reading content.website.
    all_contents = CrawledContent.objects.select_related('website').order_by(
        'website__name', '-created_at'
    )

    # Narrow to one website when a valid id was supplied; silently ignore junk.
    if selected_website_id:
        try:
            selected_website_id = int(selected_website_id)
            all_contents = all_contents.filter(website_id=selected_website_id)
        except (ValueError, TypeError):
            pass

    # get_page tolerates out-of-range / non-numeric page numbers
    paginator = Paginator(all_contents, page_size)
    page_obj = paginator.get_page(page_number)

    # Group the current page's items by website name for the template
    contents_by_website = {}
    for content in page_obj:
        website_name = content.website.name
        if website_name not in contents_by_website:
            contents_by_website[website_name] = []
        contents_by_website[website_name].append(content)

    # Most recent tasks
    recent_tasks = CrawlTask.objects.order_by('-created_at')[:5]

    # Media file statistics (total count of related media_files rows)
    total_media_files = CrawledContent.objects.aggregate(
        total_media=Count('media_files')
    )['total_media'] or 0

    stats = {
        'total_websites': total_websites,
        'total_tasks': total_tasks,
        'total_contents': total_contents,
        'active_tasks': active_tasks,
        'websites': websites,
        'selected_website_id': selected_website_id,
        'page_obj': page_obj,
        'contents_by_website': contents_by_website,
        'page_size': page_size,
        'recent_tasks': recent_tasks,
        'total_media_files': total_media_files,
    }
    return render(request, 'crawler/dashboard.html', {'stats': stats})


def search_page(request):
    """Search page view.

    Records the search keyword (first use only, via get_or_create) and
    returns up to 50 newest matches across title, content, and the
    keywords_matched field.
    """
    keyword = request.GET.get('q', '').strip()
    contents = []
    if keyword:
        # Record the search keyword
        SearchKeyword.objects.get_or_create(
            keyword=keyword,
            defaults={'last_used': timezone.now()}
        )
        # Search content
        contents = CrawledContent.objects.filter(
            Q(title__icontains=keyword) |
            Q(content__icontains=keyword) |
            Q(keywords_matched__icontains=keyword)
        ).order_by('-created_at')[:50]
    return render(request, 'crawler/search.html', {
        'keyword': keyword,
        'contents': contents
    })


def preview_crawled_content(request, content_id):
    """Render a standalone HTML preview of one crawled content item.

    NOTE(review): the original f-string HTML template in this function was
    garbled in the source file. The markup below is a reconstruction that
    preserves the recoverable pieces — the per-media-file labels
    ('原始URL:', '文件大小:'), the newline -> <br> conversion of the body
    text, and the {formatted_content} / media_section placeholders — but the
    exact surrounding HTML must be confirmed against the original file.
    """
    content = get_object_or_404(CrawledContent, id=content_id)

    # Related media files for this content item
    media_files = content.media_files.all()

    # Build the media-files section of the page, if any exist
    media_section = ""
    if media_files:
        parts = []
        for media_file in media_files:
            parts.append(
                f'<div class="media-item">'
                f'<p>原始URL: {media_file.original_url}</p>'
                f'<p>文件大小: {media_file.file_size_display}</p>'
                f'</div>'
            )
        media_section = '<div class="media-section">' + ''.join(parts) + '</div>'

    # Convert newlines so plain-text content renders with line breaks.
    # (content.content presumably holds the extracted article text — confirm.)
    formatted_content = (content.content or '').replace('\n', '<br>')

    # Dynamically generate the preview page
    html_content = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{content.title}</title>
</head>
<body>
<h1>{content.title}</h1>
<div class="content">{formatted_content}</div>
{media_section}
</body>
</html>"""
    return HttpResponse(html_content)