Files
icac/crawler/models.py
2025-09-23 13:30:03 +08:00

195 lines
7.9 KiB
Python

from django.db import models
from django.utils import timezone
import os
def crawled_content_file_path(instance, filename):
    """Build the storage path for a crawled-content file.

    Intended for use as a ``FileField.upload_to`` callable.

    Args:
        instance: Model instance that owns the file; ``instance.task.id``
            and ``instance.id`` are read.
        filename: Name of the file being stored. Its extension is always
            kept; its stem is used only when the instance has no id yet.

    Returns:
        A relative path of the form
        ``crawled_content/<task id>/<timestamp>_<id or stem><ext>``.
    """
    timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    # basename() keeps any directory part of the incoming name out of the stem.
    stem, ext = os.path.splitext(os.path.basename(filename))
    # upload_to can run before the row is first inserted, when the primary key
    # is still None; fall back to the file's own stem instead of emitting a
    # literal "None" in the stored file name.
    unique_part = instance.id if instance.id is not None else stem
    return f'crawled_content/{instance.task.id}/{timestamp}_{unique_part}{ext}'
def media_file_path(instance, filename):
    """Build the storage path for a media file.

    Args:
        instance: Model instance that owns the file; ``instance.content`` and
            its ``task`` are read for their ids.
        filename: Name of the file being stored; both stem and extension are
            kept in the result.

    Returns:
        A relative path of the form
        ``<task id>/<content id>/<timestamp>_<stem><ext>``.
    """
    stem, suffix = os.path.splitext(filename)
    stamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    owner = instance.content
    return f'{owner.task.id}/{owner.id}/{stamp}_{stem}{suffix}'
class Website(models.Model):
    """Target website model."""

    name = models.CharField(
        max_length=100,
        verbose_name='网站名称',
    )
    url = models.URLField(
        verbose_name='网站地址',
    )
    region = models.CharField(
        max_length=50,
        verbose_name='所属地区',
    )
    is_active = models.BooleanField(
        default=True,
        verbose_name='是否启用',
    )
    # created_at is stamped once on insert; updated_at on every save.
    created_at = models.DateTimeField(
        auto_now_add=True,
        verbose_name='创建时间',
    )
    updated_at = models.DateTimeField(
        auto_now=True,
        verbose_name='更新时间',
    )

    class Meta:
        ordering = ['region', 'name']
        verbose_name = '目标网站'
        verbose_name_plural = '目标网站'

    def __str__(self):
        """Return ``"<region> - <name>"``."""
        return "{} - {}".format(self.region, self.name)
class CrawlTask(models.Model):
    """Crawl task model."""

    # (stored value, display label) pairs for ``status``.
    TASK_STATUS_CHOICES = [
        ('pending', '待执行'),
        ('running', '执行中'),
        ('completed', '已完成'),
        ('failed', '执行失败'),
        ('cancelled', '已取消'),
    ]

    name = models.CharField(
        max_length=200,
        verbose_name='任务名称',
    )
    keywords = models.TextField(
        verbose_name='搜索关键字',
        help_text='多个关键字用逗号分隔',
    )
    websites = models.ManyToManyField(
        Website,
        verbose_name='目标网站',
    )
    status = models.CharField(
        max_length=20,
        choices=TASK_STATUS_CHOICES,
        default='pending',
        verbose_name='任务状态',
    )
    created_by = models.CharField(
        max_length=100,
        verbose_name='创建者',
        default='system',
    )
    created_at = models.DateTimeField(
        auto_now_add=True,
        verbose_name='创建时间',
    )
    started_at = models.DateTimeField(
        null=True,
        blank=True,
        verbose_name='开始时间',
    )
    completed_at = models.DateTimeField(
        null=True,
        blank=True,
        verbose_name='完成时间',
    )
    error_message = models.TextField(
        blank=True,
        verbose_name='错误信息',
    )
    # Page counters read by ``progress_percentage``.
    total_pages = models.IntegerField(
        default=0,
        verbose_name='总页数',
    )
    crawled_pages = models.IntegerField(
        default=0,
        verbose_name='已爬取页数',
    )

    class Meta:
        ordering = ['-created_at']
        verbose_name = '爬取任务'
        verbose_name_plural = '爬取任务'

    def __str__(self):
        """Return ``"<name> - <status display label>"``."""
        return "{} - {}".format(self.name, self.get_status_display())

    @property
    def progress_percentage(self):
        """Return task progress as a percentage rounded to two decimals.

        Returns 0 when ``total_pages`` is 0, which avoids dividing by zero.
        """
        total = self.total_pages
        if total == 0:
            return 0
        fraction = self.crawled_pages / total
        return round(fraction * 100, 2)
class CrawledContent(models.Model):
    """Crawled content model."""

    task = models.ForeignKey(
        CrawlTask,
        on_delete=models.CASCADE,
        related_name='contents',
        verbose_name='所属任务',
    )
    website = models.ForeignKey(
        Website,
        on_delete=models.CASCADE,
        verbose_name='来源网站',
    )
    title = models.CharField(
        max_length=500,
        verbose_name='标题',
    )
    content = models.TextField(
        verbose_name='内容',
    )
    url = models.URLField(
        verbose_name='原文链接',
    )
    publish_date = models.DateTimeField(
        null=True,
        blank=True,
        verbose_name='发布时间',
    )
    author = models.CharField(
        max_length=100,
        blank=True,
        verbose_name='作者',
    )
    keywords_matched = models.TextField(
        verbose_name='匹配的关键字',
        help_text='匹配到的关键字,用逗号分隔',
    )
    created_at = models.DateTimeField(
        auto_now_add=True,
        verbose_name='爬取时间',
    )

    # Local-storage fields.
    local_file = models.FileField(
        upload_to=crawled_content_file_path,
        blank=True,
        null=True,
        verbose_name='本地文件',
    )
    is_local_saved = models.BooleanField(
        default=False,
        verbose_name='是否已本地保存',
    )

    class Meta:
        ordering = ['-created_at']
        verbose_name = '爬取内容'
        verbose_name_plural = '爬取内容'
        indexes = [
            models.Index(fields=['task', 'website']),
            models.Index(fields=['created_at']),
            models.Index(fields=['publish_date']),
        ]

    def __str__(self):
        """Return ``"<website name> - <first 50 characters of the title>"``."""
        return "{} - {}".format(self.website.name, self.title[:50])

    def get_preview_content(self, max_length=500):
        """Return the content, truncated for preview.

        Args:
            max_length: Maximum number of content characters to keep.

        Returns:
            The full content when it fits within ``max_length``; otherwise
            its first ``max_length`` characters followed by ``'...'``.
        """
        text = self.content
        if len(text) > max_length:
            return text[:max_length] + '...'
        return text
class MediaFile(models.Model):
    """Media file model."""

    # (stored value, display label) pairs for ``media_type``.
    MEDIA_TYPE_CHOICES = [
        ('image', '图片'),
        ('video', '视频'),
        ('audio', '音频'),
        ('document', '文档'),
    ]

    content = models.ForeignKey(
        CrawledContent,
        on_delete=models.CASCADE,
        related_name='media_files',
        verbose_name='所属内容',
    )
    media_type = models.CharField(
        max_length=20,
        choices=MEDIA_TYPE_CHOICES,
        verbose_name='媒体类型',
    )
    original_url = models.URLField(verbose_name='原始URL')
    local_file = models.FileField(
        upload_to=media_file_path,
        verbose_name='本地文件',
    )
    # Size in bytes; NULL means the size is not known.
    file_size = models.BigIntegerField(
        null=True,
        blank=True,
        verbose_name='文件大小(字节)',
    )
    mime_type = models.CharField(
        max_length=100,
        blank=True,
        verbose_name='MIME类型',
    )
    alt_text = models.CharField(
        max_length=500,
        blank=True,
        verbose_name='替代文本',
    )
    created_at = models.DateTimeField(
        auto_now_add=True,
        verbose_name='创建时间',
    )

    class Meta:
        verbose_name = '媒体文件'
        verbose_name_plural = '媒体文件'
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['content', 'media_type']),
            models.Index(fields=['created_at']),
        ]

    def __str__(self):
        """Return ``"<media type display label> - <original URL>"``."""
        return f"{self.get_media_type_display()} - {self.original_url}"

    @property
    def file_size_display(self):
        """Return the file size as a human-readable string.

        Returns:
            ``"未知"`` when ``file_size`` is ``None``; otherwise the size with
            one decimal place and the largest fitting unit among B, KB, MB,
            GB and TB (1024-based).
        """
        # Only None means "unknown"; an empty (0-byte) file is a known size
        # and must be rendered as "0.0 B", so do not use a truthiness check.
        if self.file_size is None:
            return "未知"
        size = self.file_size
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size < 1024.0:
                return f"{size:.1f} {unit}"
            size /= 1024.0
        # Anything that is still >= 1024 GB is reported in TB.
        return f"{size:.1f} TB"
class CrawlLog(models.Model):
    """Crawl log model."""

    # (stored value, display label) pairs for ``level``.
    LOG_LEVEL_CHOICES = [
        ('info', '信息'),
        ('warning', '警告'),
        ('error', '错误'),
        ('debug', '调试'),
    ]

    task = models.ForeignKey(
        CrawlTask,
        on_delete=models.CASCADE,
        related_name='logs',
        verbose_name='所属任务',
    )
    # Optional: a log entry does not have to reference a website.
    website = models.ForeignKey(
        Website,
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        verbose_name='相关网站',
    )
    level = models.CharField(
        max_length=20,
        choices=LOG_LEVEL_CHOICES,
        verbose_name='日志级别',
    )
    message = models.TextField(
        verbose_name='日志消息',
    )
    created_at = models.DateTimeField(
        auto_now_add=True,
        verbose_name='记录时间',
    )

    class Meta:
        ordering = ['-created_at']
        verbose_name = '爬取日志'
        verbose_name_plural = '爬取日志'

    def __str__(self):
        """Return ``"[<level display label>] <first 100 chars of message>"``."""
        return "[{}] {}".format(self.get_level_display(), self.message[:100])
class SearchKeyword(models.Model):
    """Search keyword model."""

    # Each keyword may be stored only once (unique constraint).
    keyword = models.CharField(
        max_length=100,
        unique=True,
        verbose_name='关键字',
    )
    is_active = models.BooleanField(
        default=True,
        verbose_name='是否启用',
    )
    created_at = models.DateTimeField(
        auto_now_add=True,
        verbose_name='创建时间',
    )
    last_used = models.DateTimeField(
        null=True,
        blank=True,
        verbose_name='最后使用时间',
    )

    class Meta:
        ordering = ['-last_used', '-created_at']
        verbose_name = '搜索关键字'
        verbose_name_plural = '搜索关键字'

    def __str__(self):
        """Return the keyword text."""
        text = self.keyword
        return text