import os

from django.db import models
from django.utils import timezone


def crawled_content_file_path(instance, filename):
    """Build the storage path for a crawled-content file.

    The path has the form
    ``crawled_content/<task id>/<timestamp>_<identifier><ext>``.

    Args:
        instance: The ``CrawledContent`` row the file is attached to.
        filename: The name of the file being stored; only its extension
            (and, as a fallback, its base name) is used.

    Returns:
        The relative storage path as a string.
    """
    timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    name, ext = os.path.splitext(filename)
    # ``upload_to`` is normally invoked before a new row is inserted, so the
    # auto-generated primary key may still be None. Fall back to the file's
    # own base name instead of writing a literal "None" into the path.
    identifier = instance.id if instance.id is not None else os.path.basename(name)
    return f'crawled_content/{instance.task.id}/{timestamp}_{identifier}{ext}'


def media_file_path(instance, filename):
    """Build the storage path for a media file.

    The path has the form
    ``<task id>/<content id>/<timestamp>_<original name><ext>``.

    Args:
        instance: The ``MediaFile`` row the file is attached to.
        filename: The name of the file being stored.

    Returns:
        The relative storage path as a string.
    """
    timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    name, ext = os.path.splitext(filename)
    return f'{instance.content.task.id}/{instance.content.id}/{timestamp}_{name}{ext}'


class Website(models.Model):
    """A target website that crawl tasks can be run against."""

    name = models.CharField(max_length=100, verbose_name='网站名称')
    url = models.URLField(verbose_name='网站地址')
    region = models.CharField(max_length=50, verbose_name='所属地区')
    is_active = models.BooleanField(default=True, verbose_name='是否启用')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
    updated_at = models.DateTimeField(auto_now=True, verbose_name='更新时间')

    class Meta:
        verbose_name = '目标网站'
        verbose_name_plural = '目标网站'
        ordering = ['region', 'name']

    def __str__(self):
        """Return ``"<region> - <name>"``."""
        return f"{self.region} - {self.name}"


class CrawlTask(models.Model):
    """A crawl task: a set of keywords searched across selected websites."""

    TASK_STATUS_CHOICES = [
        ('pending', '待执行'),
        ('running', '执行中'),
        ('completed', '已完成'),
        ('failed', '执行失败'),
        ('cancelled', '已取消'),
    ]

    name = models.CharField(max_length=200, verbose_name='任务名称')
    # Multiple keywords are stored in one text field, comma-separated.
    keywords = models.TextField(verbose_name='搜索关键字', help_text='多个关键字用逗号分隔')
    websites = models.ManyToManyField(Website, verbose_name='目标网站')
    status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name='任务状态')
    created_by = models.CharField(max_length=100, verbose_name='创建者', default='system')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
    started_at = models.DateTimeField(null=True, blank=True, verbose_name='开始时间')
    completed_at = models.DateTimeField(null=True, blank=True, verbose_name='完成时间')
    error_message = models.TextField(blank=True, verbose_name='错误信息')
    total_pages = models.IntegerField(default=0, verbose_name='总页数')
    crawled_pages = models.IntegerField(default=0, verbose_name='已爬取页数')

    class Meta:
        verbose_name = '爬取任务'
        verbose_name_plural = '爬取任务'
        ordering = ['-created_at']

    def __str__(self):
        """Return ``"<name> - <status label>"``."""
        return f"{self.name} - {self.get_status_display()}"

    @property
    def progress_percentage(self):
        """Return task progress as a percentage in the range 0-100.

        Returns 0 when ``total_pages`` is zero or negative. Otherwise the
        ratio ``crawled_pages / total_pages`` is rounded to two decimals and
        clamped to [0, 100], so a page count that overshoots the total never
        reports more than 100%.
        """
        if self.total_pages <= 0:
            return 0
        percentage = round((self.crawled_pages / self.total_pages) * 100, 2)
        # Clamp: the two counters are independent integer fields, so nothing
        # guarantees crawled_pages stays within [0, total_pages].
        return min(max(percentage, 0.0), 100.0)


class CrawledContent(models.Model):
    """A single piece of content collected by a crawl task."""

    task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='contents', verbose_name='所属任务')
    website = models.ForeignKey(Website, on_delete=models.CASCADE, verbose_name='来源网站')
    title = models.CharField(max_length=500, verbose_name='标题')
    content = models.TextField(verbose_name='内容')
    url = models.URLField(verbose_name='原文链接')
    publish_date = models.DateTimeField(null=True, blank=True, verbose_name='发布时间')
    author = models.CharField(max_length=100, blank=True, verbose_name='作者')
    # Matched keywords are stored in one text field, comma-separated.
    keywords_matched = models.TextField(verbose_name='匹配的关键字', help_text='匹配到的关键字,用逗号分隔')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='爬取时间')

    # Local storage fields.
    local_file = models.FileField(upload_to=crawled_content_file_path, blank=True, null=True, verbose_name='本地文件')
    is_local_saved = models.BooleanField(default=False, verbose_name='是否已本地保存')

    class Meta:
        verbose_name = '爬取内容'
        verbose_name_plural = '爬取内容'
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['task', 'website']),
            models.Index(fields=['created_at']),
            models.Index(fields=['publish_date']),
        ]

    def __str__(self):
        """Return ``"<website name> - <first 50 characters of the title>"``."""
        return f"{self.website.name} - {self.title[:50]}"

    def get_preview_content(self, max_length=500):
        """Return the content, truncated for preview.

        Args:
            max_length: Maximum number of characters kept from ``content``.

        Returns:
            ``content`` unchanged when it is at most ``max_length`` characters
            long; otherwise its first ``max_length`` characters followed by
            ``'...'``.
        """
        if len(self.content) <= max_length:
            return self.content
        return self.content[:max_length] + '...'
class MediaFile(models.Model):
    """A media file (image, video, audio or document) attached to crawled content."""

    MEDIA_TYPE_CHOICES = [
        ('image', '图片'),
        ('video', '视频'),
        ('audio', '音频'),
        ('document', '文档'),
    ]

    content = models.ForeignKey(CrawledContent, on_delete=models.CASCADE, related_name='media_files', verbose_name='所属内容')
    media_type = models.CharField(max_length=20, choices=MEDIA_TYPE_CHOICES, verbose_name='媒体类型')
    original_url = models.URLField(verbose_name='原始URL')
    local_file = models.FileField(upload_to=media_file_path, verbose_name='本地文件')
    # Size in bytes; NULL means the size was not recorded.
    file_size = models.BigIntegerField(null=True, blank=True, verbose_name='文件大小(字节)')
    mime_type = models.CharField(max_length=100, blank=True, verbose_name='MIME类型')
    alt_text = models.CharField(max_length=500, blank=True, verbose_name='替代文本')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')

    class Meta:
        verbose_name = '媒体文件'
        verbose_name_plural = '媒体文件'
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['content', 'media_type']),
            models.Index(fields=['created_at']),
        ]

    def __str__(self):
        """Return ``"<media type label> - <original URL>"``."""
        return f"{self.get_media_type_display()} - {self.original_url}"

    @property
    def file_size_display(self):
        """Return the file size as a human-readable string.

        Returns ``"未知"`` when no size is recorded (``file_size`` is None).
        Otherwise the size is divided by 1024 until it drops below 1024 and
        is formatted with one decimal and a unit from B, KB, MB, GB or TB.
        """
        # Compare against None explicitly: a 0-byte file is a known size and
        # must render as "0.0 B", not as "unknown".
        if self.file_size is None:
            return "未知"

        size = self.file_size
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size < 1024.0:
                return f"{size:.1f} {unit}"
            size /= 1024.0
        return f"{size:.1f} TB"


class CrawlLog(models.Model):
    """A log entry recorded for a crawl task, optionally tied to a website."""

    LOG_LEVEL_CHOICES = [
        ('info', '信息'),
        ('warning', '警告'),
        ('error', '错误'),
        ('debug', '调试'),
    ]

    task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='logs', verbose_name='所属任务')
    website = models.ForeignKey(Website, on_delete=models.CASCADE, null=True, blank=True, verbose_name='相关网站')
    level = models.CharField(max_length=20, choices=LOG_LEVEL_CHOICES, verbose_name='日志级别')
    message = models.TextField(verbose_name='日志消息')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='记录时间')

    class Meta:
        verbose_name = '爬取日志'
        verbose_name_plural = '爬取日志'
        ordering = ['-created_at']

    def __str__(self):
        """Return ``"[<level label>] <first 100 characters of the message>"``."""
        return f"[{self.get_level_display()}] {self.message[:100]}"


class SearchKeyword(models.Model):
    """A unique search keyword together with when it was last used."""

    keyword = models.CharField(max_length=100, unique=True, verbose_name='关键字')
    is_active = models.BooleanField(default=True, verbose_name='是否启用')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
    last_used = models.DateTimeField(null=True, blank=True, verbose_name='最后使用时间')

    class Meta:
        verbose_name = '搜索关键字'
        verbose_name_plural = '搜索关键字'
        ordering = ['-last_used', '-created_at']

    def __str__(self):
        """Return the keyword text."""
        return self.keyword