from django.shortcuts import render, get_object_or_404
from django.http import HttpResponse, Http404
from django.db.models import Q, Count
from django.conf import settings
from django.utils import timezone
from django.core.files.storage import default_storage
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
from rest_framework import viewsets, filters
from rest_framework.decorators import action
from rest_framework.response import Response
from .serializers import (
    WebsiteSerializer, CrawlTaskSerializer, CrawledContentSerializer,
    CrawlLogSerializer, SearchKeywordSerializer
)
import json
from django.core.paginator import Paginator
from django.db.models.functions import TruncDate
import os
import tempfile
import zipfile
from io import BytesIO
from docx import Document
from django.core.files.base import ContentFile


def dashboard(request):
    """Dashboard view."""
    # Summary statistics
    total_websites = Website.objects.filter(is_active=True).count()
    total_tasks = CrawlTask.objects.count()
    total_contents = CrawledContent.objects.count()
    active_tasks = CrawlTask.objects.filter(status='running').count()

    # All active websites
    websites = Website.objects.filter(is_active=True).order_by('name')

    # Currently selected website ID
    selected_website_id = request.GET.get('website')

    # Pagination parameters
    page_number = request.GET.get('page', 1)
    page_size = request.GET.get('page_size', 20)  # default: 20 articles per page

    # Coerce page_size to an integer and clamp it to a sensible range
    try:
        page_size = int(page_size)
        page_size = max(10, min(100, page_size))
    except (ValueError, TypeError):
        page_size = 20

    # All crawled content, ordered by website and then newest first
    all_contents = CrawledContent.objects.select_related('website').order_by('website__name', '-created_at')

    # Filter down to one website if one was selected
    if selected_website_id:
        try:
            selected_website_id = int(selected_website_id)
            all_contents = all_contents.filter(website_id=selected_website_id)
        except (ValueError, TypeError):
            pass

    # Paginate
    paginator = Paginator(all_contents, page_size)
    page_obj = paginator.get_page(page_number)

    # Group the current page's content by website
    contents_by_website = {}
    for content in page_obj:
        website_name = content.website.name
        if website_name not in contents_by_website:
            contents_by_website[website_name] = []
        contents_by_website[website_name].append(content)

    # Most recent tasks
    recent_tasks = CrawlTask.objects.order_by('-created_at')[:5]

    # Media file statistics
    total_media_files = CrawledContent.objects.aggregate(
        total_media=Count('media_files')
    )['total_media'] or 0

    stats = {
        'total_websites': total_websites,
        'total_tasks': total_tasks,
        'total_contents': total_contents,
        'active_tasks': active_tasks,
        'websites': websites,
        'selected_website_id': selected_website_id,
        'page_obj': page_obj,
        'contents_by_website': contents_by_website,
        'page_size': page_size,
        'recent_tasks': recent_tasks,
        'total_media_files': total_media_files,
    }
    return render(request, 'crawler/dashboard.html', {'stats': stats})


def search_page(request):
    """Search page view."""
    keyword = request.GET.get('q', '').strip()
    contents = []
    if keyword:
        # Record the search keyword
        SearchKeyword.objects.get_or_create(
            keyword=keyword,
            defaults={'last_used': timezone.now()}
        )
        # Search across title, body, and matched keywords
        contents = CrawledContent.objects.filter(
            Q(title__icontains=keyword) |
            Q(content__icontains=keyword) |
            Q(keywords_matched__icontains=keyword)
        ).order_by('-created_at')[:50]
    return render(request, 'crawler/search.html', {
        'keyword': keyword,
        'contents': contents
    })


def preview_crawled_content(request, content_id):
    """Preview a piece of crawled content."""
    content = get_object_or_404(CrawledContent, id=content_id)

    # Attached media files
    media_files = content.media_files.all()

    # Build the media-file section of the page
    media_section = ""
    if media_files:
media_section = """
原始URL: {media_file.original_url}
文件大小: {media_file.file_size_display}
原始URL: {media_file.original_url}
文件大小: {media_file.file_size_display}
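
    # Convert the stored plain text to simple HTML: blank lines become
    # paragraph boundaries and single newlines become line breaks.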
    formatted_content = content.content.replace('\n\n', '</p><p>').replace('\n', '<br>')
    # Dynamically generate the preview page
    html_content = f"""<!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>{content.title}</title>
    </head>
    <body>
        <h1>{content.title}</h1>
        <p>{formatted_content}</p>
        {media_section}
    </body>
    </html>"""
    return HttpResponse(html_content)
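
# For reference, a minimal sketch of how these views might be wired up in the
# app's urls.py. The route paths and names below are illustrative assumptions,
# not taken from this project; the <int:content_id> converter matches the
# signature of preview_crawled_content above.
#
#   from django.urls import path
#   from . import views
#
#   urlpatterns = [
#       path('', views.dashboard, name='dashboard'),
#       path('search/', views.search_page, name='search'),
#       path('content/<int:content_id>/preview/', views.preview_crawled_content,
#            name='preview_crawled_content'),
#   ]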