Base setup
This commit is contained in:
195
crawler/models.py
Normal file
195
crawler/models.py
Normal file
@@ -0,0 +1,195 @@
|
||||
from django.db import models
|
||||
from django.utils import timezone
|
||||
import os
|
||||
|
||||
|
||||
def crawled_content_file_path(instance, filename):
    """Build the storage path for a crawled-content file.

    Intended for use as a ``FileField.upload_to`` callable.

    Args:
        instance: The model instance the file is attached to; its
            ``task.id`` and ``id`` attributes are read.
        filename: The name the file was originally given. Only its
            extension (and, as a fallback, its stem) is reused.

    Returns:
        A relative path of the form
        ``crawled_content/<task id>/<timestamp>_<suffix><ext>`` where
        ``<suffix>`` is the instance id, or the original file stem when the
        instance has no id yet.
    """
    timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    # basename() keeps any directory part of the incoming name out of the stem.
    name, ext = os.path.splitext(os.path.basename(filename))
    # upload_to may run before the first save(), when an auto primary key is
    # still None; fall back to the stem rather than writing "None" in the path.
    suffix = name if instance.id is None else instance.id
    return f'crawled_content/{instance.task.id}/{timestamp}_{suffix}{ext}'
|
||||
|
||||
|
||||
def media_file_path(instance, filename):
    """Build the storage path for a media file.

    Args:
        instance: The model instance the file is attached to; its
            ``content.task.id`` and ``content.id`` attributes are read.
        filename: The name the file was originally given; both its stem and
            its extension are kept in the result.

    Returns:
        A relative path of the form
        ``<task id>/<content id>/<timestamp>_<stem><ext>``.
    """
    stamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    stem, suffix = os.path.splitext(filename)
    # Directory part: one folder per task, one sub-folder per content item.
    directory = f'{instance.content.task.id}/{instance.content.id}'
    return f'{directory}/{stamp}_{stem}{suffix}'
|
||||
|
||||
|
||||
class Website(models.Model):
    """A target website to crawl."""

    # Display name of the site.
    name = models.CharField(
        max_length=100,
        verbose_name='网站名称',
    )
    # Address of the site.
    url = models.URLField(
        verbose_name='网站地址',
    )
    # Region the site belongs to.
    region = models.CharField(
        max_length=50,
        verbose_name='所属地区',
    )
    # Whether the site is enabled.
    is_active = models.BooleanField(
        default=True,
        verbose_name='是否启用',
    )
    # Set once when the row is created.
    created_at = models.DateTimeField(
        auto_now_add=True,
        verbose_name='创建时间',
    )
    # Refreshed on every save.
    updated_at = models.DateTimeField(
        auto_now=True,
        verbose_name='更新时间',
    )

    class Meta:
        verbose_name = '目标网站'
        verbose_name_plural = '目标网站'
        # Default ordering: by region first, then by name.
        ordering = ['region', 'name']

    def __str__(self):
        """Return ``"<region> - <name>"``."""
        return '{} - {}'.format(self.region, self.name)
|
||||
|
||||
|
||||
class CrawlTask(models.Model):
    """A crawl task: a set of keywords to search for on a set of websites."""

    # Allowed values for ``status`` as (stored value, display label) pairs.
    TASK_STATUS_CHOICES = [
        ('pending', '待执行'),
        ('running', '执行中'),
        ('completed', '已完成'),
        ('failed', '执行失败'),
        ('cancelled', '已取消'),
    ]

    # Task name.
    name = models.CharField(max_length=200, verbose_name='任务名称')
    # Search keywords; the help text asks for a comma-separated list.
    keywords = models.TextField(verbose_name='搜索关键字', help_text='多个关键字用逗号分隔')
    # Websites this task targets.
    websites = models.ManyToManyField(Website, verbose_name='目标网站')
    # Current state, one of TASK_STATUS_CHOICES; new tasks start as 'pending'.
    status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name='任务状态')
    # Creator, stored as plain text; defaults to 'system'.
    created_by = models.CharField(max_length=100, verbose_name='创建者', default='system')
    # Set once when the row is created.
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
    # Start / completion times; both may be empty.
    started_at = models.DateTimeField(null=True, blank=True, verbose_name='开始时间')
    completed_at = models.DateTimeField(null=True, blank=True, verbose_name='完成时间')
    # Error details; may be blank.
    error_message = models.TextField(blank=True, verbose_name='错误信息')
    # Page counters used by ``progress_percentage``.
    total_pages = models.IntegerField(default=0, verbose_name='总页数')
    crawled_pages = models.IntegerField(default=0, verbose_name='已爬取页数')

    class Meta:
        verbose_name = '爬取任务'
        verbose_name_plural = '爬取任务'
        # Newest tasks first.
        ordering = ['-created_at']

    def __str__(self):
        """Return ``"<name> - <status display label>"``."""
        return f"{self.name} - {self.get_status_display()}"

    @property
    def progress_percentage(self):
        """Return the task progress as a percentage between 0 and 100.

        Returns:
            ``0`` when ``total_pages`` is zero or negative; otherwise
            ``crawled_pages / total_pages * 100`` rounded to two decimals
            and clamped to the range 0-100.
        """
        # Both counters are plain IntegerFields, so guard against a
        # non-positive total instead of only an exact zero.
        if self.total_pages <= 0:
            return 0
        percentage = round((self.crawled_pages / self.total_pages) * 100, 2)
        # crawled_pages is not bounded by total_pages; keep the result a
        # valid percentage.
        return min(max(percentage, 0.0), 100.0)
|
||||
|
||||
|
||||
class CrawledContent(models.Model):
    """A single piece of content collected by a crawl task."""

    # Owning task; reachable from the task as ``task.contents``.
    task = models.ForeignKey(
        CrawlTask,
        on_delete=models.CASCADE,
        related_name='contents',
        verbose_name='所属任务',
    )
    # Website the content came from.
    website = models.ForeignKey(
        Website,
        on_delete=models.CASCADE,
        verbose_name='来源网站',
    )
    # Title of the content.
    title = models.CharField(
        max_length=500,
        verbose_name='标题',
    )
    # Body text.
    content = models.TextField(
        verbose_name='内容',
    )
    # Link to the original article.
    # NOTE(review): URLField defaults to max_length=200 — confirm that is
    # long enough for the URLs being crawled.
    url = models.URLField(
        verbose_name='原文链接',
    )
    # Publication time; may be empty.
    publish_date = models.DateTimeField(
        null=True,
        blank=True,
        verbose_name='发布时间',
    )
    # Author; may be blank.
    author = models.CharField(
        max_length=100,
        blank=True,
        verbose_name='作者',
    )
    # Keywords that matched; the help text describes a comma-separated list.
    keywords_matched = models.TextField(
        verbose_name='匹配的关键字',
        help_text='匹配到的关键字,用逗号分隔',
    )
    # Set once when the row is created (labelled as the crawl time).
    created_at = models.DateTimeField(
        auto_now_add=True,
        verbose_name='爬取时间',
    )

    # Local storage fields.
    local_file = models.FileField(
        upload_to=crawled_content_file_path,
        blank=True,
        null=True,
        verbose_name='本地文件',
    )
    # Flag labelled "saved locally"; defaults to False.
    is_local_saved = models.BooleanField(
        default=False,
        verbose_name='是否已本地保存',
    )

    class Meta:
        verbose_name = '爬取内容'
        verbose_name_plural = '爬取内容'
        # Newest content first.
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['task', 'website']),
            models.Index(fields=['created_at']),
            models.Index(fields=['publish_date']),
        ]

    def __str__(self):
        """Return ``"<website name> - <first 50 characters of the title>"``."""
        return '{} - {}'.format(self.website.name, self.title[:50])

    def get_preview_content(self, max_length=500):
        """Return the content, shortened for preview.

        Args:
            max_length: Maximum number of characters kept from the content.

        Returns:
            The full content when it is at most ``max_length`` characters
            long; otherwise its first ``max_length`` characters followed by
            ``'...'``.
        """
        text = self.content
        if len(text) > max_length:
            return text[:max_length] + '...'
        return text
|
||||
|
||||
|
||||
class MediaFile(models.Model):
    """A media file attached to a piece of crawled content."""

    # Allowed values for ``media_type`` as (stored value, display label) pairs.
    MEDIA_TYPE_CHOICES = [
        ('image', '图片'),
        ('video', '视频'),
        ('audio', '音频'),
        ('document', '文档'),
    ]

    # Owning content; reachable from the content as ``content.media_files``.
    content = models.ForeignKey(CrawledContent, on_delete=models.CASCADE, related_name='media_files', verbose_name='所属内容')
    # Kind of media, one of MEDIA_TYPE_CHOICES.
    media_type = models.CharField(max_length=20, choices=MEDIA_TYPE_CHOICES, verbose_name='媒体类型')
    # Original URL of the media.
    original_url = models.URLField(verbose_name='原始URL')
    # Local file; its path is produced by ``media_file_path``.
    local_file = models.FileField(upload_to=media_file_path, verbose_name='本地文件')
    # Size in bytes; empty when not recorded.
    file_size = models.BigIntegerField(null=True, blank=True, verbose_name='文件大小(字节)')
    # MIME type; may be blank.
    mime_type = models.CharField(max_length=100, blank=True, verbose_name='MIME类型')
    # Alternative text; may be blank.
    alt_text = models.CharField(max_length=500, blank=True, verbose_name='替代文本')
    # Set once when the row is created.
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')

    class Meta:
        verbose_name = '媒体文件'
        verbose_name_plural = '媒体文件'
        # Newest files first.
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['content', 'media_type']),
            models.Index(fields=['created_at']),
        ]

    def __str__(self):
        """Return ``"<media type display label> - <original URL>"``."""
        return f"{self.get_media_type_display()} - {self.original_url}"

    @property
    def file_size_display(self):
        """Return the file size as a human-readable string.

        Returns:
            ``"未知"`` when ``file_size`` is not set; otherwise the size with
            one decimal place in the first unit (B, KB, MB, GB, TB; factor
            1024) in which it is below 1024, with TB as the largest unit.
        """
        # Only a missing value means "unknown"; an empty file has a known
        # size of 0 bytes and must be shown as "0.0 B".
        if self.file_size is None:
            return "未知"

        size = self.file_size
        for unit in ('B', 'KB', 'MB', 'GB'):
            if size < 1024.0:
                return f"{size:.1f} {unit}"
            size /= 1024.0
        # Anything that is still >= 1024 GB is reported in terabytes.
        return f"{size:.1f} TB"
|
||||
|
||||
|
||||
class CrawlLog(models.Model):
    """A log entry recorded for a crawl task."""

    # Allowed values for ``level`` as (stored value, display label) pairs.
    LOG_LEVEL_CHOICES = [
        ('info', '信息'),
        ('warning', '警告'),
        ('error', '错误'),
        ('debug', '调试'),
    ]

    # Owning task; reachable from the task as ``task.logs``.
    task = models.ForeignKey(
        CrawlTask,
        on_delete=models.CASCADE,
        related_name='logs',
        verbose_name='所属任务',
    )
    # Related website; optional.
    website = models.ForeignKey(
        Website,
        on_delete=models.CASCADE,
        null=True,
        blank=True,
        verbose_name='相关网站',
    )
    # Severity, one of LOG_LEVEL_CHOICES.
    level = models.CharField(
        max_length=20,
        choices=LOG_LEVEL_CHOICES,
        verbose_name='日志级别',
    )
    # Log message text.
    message = models.TextField(
        verbose_name='日志消息',
    )
    # Set once when the row is created.
    created_at = models.DateTimeField(
        auto_now_add=True,
        verbose_name='记录时间',
    )

    class Meta:
        verbose_name = '爬取日志'
        verbose_name_plural = '爬取日志'
        # Newest entries first.
        ordering = ['-created_at']

    def __str__(self):
        """Return ``"[<level display label>] <first 100 characters of the message>"``."""
        return '[{}] {}'.format(self.get_level_display(), self.message[:100])
|
||||
|
||||
|
||||
class SearchKeyword(models.Model):
    """A stored search keyword."""

    # The keyword text; must be unique.
    keyword = models.CharField(
        max_length=100,
        unique=True,
        verbose_name='关键字',
    )
    # Whether the keyword is enabled.
    is_active = models.BooleanField(
        default=True,
        verbose_name='是否启用',
    )
    # Set once when the row is created.
    created_at = models.DateTimeField(
        auto_now_add=True,
        verbose_name='创建时间',
    )
    # Time of last use; may be empty.
    last_used = models.DateTimeField(
        null=True,
        blank=True,
        verbose_name='最后使用时间',
    )

    class Meta:
        verbose_name = '搜索关键字'
        verbose_name_plural = '搜索关键字'
        # Most recently used first, then most recently created.
        ordering = ['-last_used', '-created_at']

    def __str__(self):
        """Return the keyword text."""
        return self.keyword
|
||||
Reference in New Issue
Block a user