195 lines
7.9 KiB
Python
195 lines
7.9 KiB
Python
from django.db import models
|
|
from django.utils import timezone
|
|
import os
|
|
|
|
|
|
def crawled_content_file_path(instance, filename):
    """Build the storage path for a crawled-content file.

    The result has the form
    ``crawled_content/<task id>/<timestamp>_<identifier><ext>``. The
    timestamp is ``timezone.now()`` formatted as ``%Y%m%d_%H%M%S`` and the
    extension is taken from ``filename``.

    Args:
        instance: Object that owns the file; ``instance.task.id`` and
            ``instance.id`` are read.
        filename: Name of the file being stored.

    Returns:
        The relative storage path as a string.
    """
    timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    stem, ext = os.path.splitext(filename)
    # Per the Django ``FileField.upload_to`` documentation the instance may
    # not have been saved yet when this runs, so an auto-generated primary
    # key can still be None. Fall back to the original file stem in that case
    # so the stored path never contains the literal text "None".
    if instance.id is not None:
        identifier = instance.id
    else:
        identifier = os.path.basename(stem)
    return f'crawled_content/{instance.task.id}/{timestamp}_{identifier}{ext}'
|
|
|
|
|
|
def media_file_path(instance, filename):
    """Build the storage path for a media file.

    The path has the form ``<task id>/<content id>/<timestamp>_<name><ext>``.
    Task and content ids are read from ``instance.content``; the timestamp is
    ``timezone.now()`` formatted as ``%Y%m%d_%H%M%S``; name and extension
    come from splitting ``filename``.

    Args:
        instance: Object that owns the file; ``instance.content`` is read.
        filename: Name of the file being stored.

    Returns:
        The relative storage path as a string.
    """
    stamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    stem, suffix = os.path.splitext(filename)
    owner = instance.content
    return '{}/{}/{}_{}{}'.format(owner.task.id, owner.id, stamp, stem, suffix)
|
|
|
|
|
|
class Website(models.Model):
    """Target website model."""

    name = models.CharField(verbose_name="网站名称", max_length=100)
    url = models.URLField(verbose_name="网站地址")
    region = models.CharField(verbose_name="所属地区", max_length=50)
    is_active = models.BooleanField(verbose_name="是否启用", default=True)
    # Timestamps are filled in automatically on insert / on every save.
    created_at = models.DateTimeField(
        verbose_name="创建时间",
        auto_now_add=True,
    )
    updated_at = models.DateTimeField(
        verbose_name="更新时间",
        auto_now=True,
    )

    class Meta:
        verbose_name = "目标网站"
        verbose_name_plural = "目标网站"
        # Default ordering: by region first, then by name.
        ordering = ["region", "name"]

    def __str__(self):
        """Return ``"<region> - <name>"``."""
        return "{} - {}".format(self.region, self.name)
|
|
|
|
|
|
class CrawlTask(models.Model):
    """Crawl task model."""

    # (stored value, display label) pairs for the ``status`` field.
    TASK_STATUS_CHOICES = [
        ('pending', '待执行'),
        ('running', '执行中'),
        ('completed', '已完成'),
        ('failed', '执行失败'),
        ('cancelled', '已取消'),
    ]

    name = models.CharField(max_length=200, verbose_name='任务名称')
    # The help text tells users to separate multiple keywords with commas.
    keywords = models.TextField(
        verbose_name='搜索关键字',
        help_text='多个关键字用逗号分隔',
    )
    websites = models.ManyToManyField(Website, verbose_name='目标网站')
    status = models.CharField(
        max_length=20,
        choices=TASK_STATUS_CHOICES,
        default='pending',
        verbose_name='任务状态',
    )
    created_by = models.CharField(
        max_length=100,
        verbose_name='创建者',
        default='system',
    )
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
    started_at = models.DateTimeField(null=True, blank=True, verbose_name='开始时间')
    completed_at = models.DateTimeField(null=True, blank=True, verbose_name='完成时间')
    error_message = models.TextField(blank=True, verbose_name='错误信息')
    total_pages = models.IntegerField(default=0, verbose_name='总页数')
    crawled_pages = models.IntegerField(default=0, verbose_name='已爬取页数')

    class Meta:
        verbose_name = '爬取任务'
        verbose_name_plural = '爬取任务'
        # Newest tasks first.
        ordering = ['-created_at']

    def __str__(self):
        """Return ``"<name> - <status display label>"``."""
        return f"{self.name} - {self.get_status_display()}"

    @property
    def progress_percentage(self):
        """Return the task progress as a percentage between 0 and 100.

        Returns:
            ``0`` when ``total_pages`` is zero or negative; otherwise
            ``crawled_pages / total_pages * 100`` rounded to two decimals,
            with the ratio clamped to the range 0..1 first.
        """
        # ``total_pages`` is a plain IntegerField, so guard against negative
        # values as well as the division-by-zero case.
        if self.total_pages <= 0:
            return 0
        ratio = self.crawled_pages / self.total_pages
        # Clamp so that a crawled count above the total (or a negative
        # count) cannot produce a percentage outside 0-100.
        ratio = min(max(ratio, 0.0), 1.0)
        return round(ratio * 100, 2)
|
|
|
|
|
|
class CrawledContent(models.Model):
    """Crawled content model."""

    task = models.ForeignKey(
        CrawlTask,
        on_delete=models.CASCADE,
        related_name="contents",
        verbose_name="所属任务",
    )
    website = models.ForeignKey(
        Website,
        on_delete=models.CASCADE,
        verbose_name="来源网站",
    )
    title = models.CharField(max_length=500, verbose_name="标题")
    content = models.TextField(verbose_name="内容")
    # NOTE(review): no max_length is given here, so the field's default limit
    # applies - confirm it is long enough for crawled article URLs.
    url = models.URLField(verbose_name="原文链接")
    publish_date = models.DateTimeField(
        null=True,
        blank=True,
        verbose_name="发布时间",
    )
    author = models.CharField(max_length=100, blank=True, verbose_name="作者")
    # The help text describes this as a comma-separated list of the keywords
    # that matched.
    keywords_matched = models.TextField(
        verbose_name="匹配的关键字",
        help_text="匹配到的关键字,用逗号分隔",
    )
    created_at = models.DateTimeField(auto_now_add=True, verbose_name="爬取时间")

    # Local storage fields.
    local_file = models.FileField(
        upload_to=crawled_content_file_path,
        blank=True,
        null=True,
        verbose_name="本地文件",
    )
    is_local_saved = models.BooleanField(
        default=False,
        verbose_name="是否已本地保存",
    )

    class Meta:
        verbose_name = "爬取内容"
        verbose_name_plural = "爬取内容"
        # Most recently crawled first.
        ordering = ["-created_at"]
        indexes = [
            models.Index(fields=["task", "website"]),
            models.Index(fields=["created_at"]),
            models.Index(fields=["publish_date"]),
        ]

    def __str__(self):
        """Return ``"<website name> - <first 50 characters of the title>"``."""
        return "{} - {}".format(self.website.name, self.title[:50])

    def get_preview_content(self, max_length=500):
        """Return the content, truncated for preview.

        Args:
            max_length: Maximum number of characters kept from ``content``.

        Returns:
            ``content`` unchanged when it is at most ``max_length``
            characters long; otherwise its first ``max_length`` characters
            followed by ``'...'``.
        """
        text = self.content
        if len(text) > max_length:
            return text[:max_length] + "..."
        return text
|
|
|
|
|
|
class MediaFile(models.Model):
    """Media file model."""

    # (stored value, display label) pairs for the ``media_type`` field.
    MEDIA_TYPE_CHOICES = [
        ('image', '图片'),
        ('video', '视频'),
        ('audio', '音频'),
        ('document', '文档'),
    ]

    content = models.ForeignKey(
        CrawledContent,
        on_delete=models.CASCADE,
        related_name='media_files',
        verbose_name='所属内容',
    )
    media_type = models.CharField(
        max_length=20,
        choices=MEDIA_TYPE_CHOICES,
        verbose_name='媒体类型',
    )
    original_url = models.URLField(verbose_name='原始URL')
    local_file = models.FileField(upload_to=media_file_path, verbose_name='本地文件')
    # Size in bytes (per the verbose name); nullable.
    file_size = models.BigIntegerField(null=True, blank=True, verbose_name='文件大小(字节)')
    mime_type = models.CharField(max_length=100, blank=True, verbose_name='MIME类型')
    alt_text = models.CharField(max_length=500, blank=True, verbose_name='替代文本')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')

    class Meta:
        verbose_name = '媒体文件'
        verbose_name_plural = '媒体文件'
        # Newest files first.
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['content', 'media_type']),
            models.Index(fields=['created_at']),
        ]

    def __str__(self):
        """Return ``"<media type display label> - <original URL>"``."""
        return f"{self.get_media_type_display()} - {self.original_url}"

    @property
    def file_size_display(self):
        """Return the file size as a human-readable string.

        Returns:
            ``"未知"`` when ``file_size`` is ``None``; otherwise the size
            scaled by powers of 1024 and formatted with one decimal place
            and a unit from B, KB, MB, GB or TB.
        """
        # Compare against None explicitly: a size of 0 is a known value
        # (an empty file) and must be shown as "0.0 B", not as unknown.
        if self.file_size is None:
            return "未知"

        size = self.file_size
        for unit in ('B', 'KB', 'MB', 'GB'):
            if size < 1024.0:
                return f"{size:.1f} {unit}"
            size /= 1024.0
        # Anything still >= 1024 after the GB step is reported in TB.
        return f"{size:.1f} TB"
|
|
|
|
|
|
class CrawlLog(models.Model):
    """Crawl log model."""

    # (stored value, display label) pairs for the ``level`` field.
    LOG_LEVEL_CHOICES = [
        ("info", "信息"),
        ("warning", "警告"),
        ("error", "错误"),
        ("debug", "调试"),
    ]

    task = models.ForeignKey(
        CrawlTask,
        on_delete=models.CASCADE,
        related_name="logs",
        verbose_name="所属任务",
    )
    # Optional: a log entry does not have to reference a website.
    website = models.ForeignKey(
        Website,
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        verbose_name="相关网站",
    )
    level = models.CharField(
        max_length=20,
        choices=LOG_LEVEL_CHOICES,
        verbose_name="日志级别",
    )
    message = models.TextField(verbose_name="日志消息")
    created_at = models.DateTimeField(auto_now_add=True, verbose_name="记录时间")

    class Meta:
        verbose_name = "爬取日志"
        verbose_name_plural = "爬取日志"
        # Newest entries first.
        ordering = ["-created_at"]

    def __str__(self):
        """Return ``"[<level display label>] <first 100 characters of message>"``."""
        return "[{}] {}".format(self.get_level_display(), self.message[:100])
|
|
|
|
|
|
class SearchKeyword(models.Model):
    """Search keyword model."""

    keyword = models.CharField(
        max_length=100,
        unique=True,
        verbose_name="关键字",
    )
    is_active = models.BooleanField(default=True, verbose_name="是否启用")
    created_at = models.DateTimeField(auto_now_add=True, verbose_name="创建时间")
    # Nullable: stays empty until a value is assigned.
    last_used = models.DateTimeField(
        null=True,
        blank=True,
        verbose_name="最后使用时间",
    )

    class Meta:
        verbose_name = "搜索关键字"
        verbose_name_plural = "搜索关键字"
        # Most recently used first, then most recently created.
        ordering = ["-last_used", "-created_at"]

    def __str__(self):
        """Return the keyword text."""
        return self.keyword