{formatted_content}
+From e51154bb29bd9a3ff4868263aeb7379054386be4 Mon Sep 17 00:00:00 2001
From: yuangyaa 监控网站 爬取任务 爬取内容 运行中任务 {{ content.content|truncatechars:100 }} 暂无爬取内容
+ 关键字: {{ task.keywords|truncatechars:40 }}
+ 暂无任务 {{ content.content|truncatechars:200 }} 没有找到包含 "{{ keyword }}" 的内容 请尝试其他关键字或检查爬取任务是否正常运行 在上方输入框中输入关键字,搜索已爬取的内容 原始URL: {media_file.original_url} 文件大小: {media_file.file_size_display} 原始URL: {media_file.original_url} 文件大小: {media_file.file_size_display}
'
+ '
'
+ '大小: {}'
+ '
'
+ '
'
+ '大小: {}'
+ '
'
+ '大小: {}'
+ '',
+ obj.local_file.name
+ )
+ elif obj.media_type == 'video' and obj.local_file:
+ return format_html(
+ '',
+ obj.local_file.name,
+ obj.mime_type
+ )
+ elif obj.media_type == 'audio' and obj.local_file:
+ return format_html(
+ '',
+ obj.local_file.name,
+ obj.mime_type
+ )
+ else:
+ return "无预览"
+ media_preview.short_description = '预览'
+
+
+@admin.register(SearchKeyword)
+class SearchKeywordAdmin(admin.ModelAdmin):
+ list_display = ['keyword', 'is_active', 'created_at', 'last_used']
+ list_filter = ['is_active', 'created_at', 'last_used']
+ search_fields = ['keyword']
+ list_editable = ['is_active']
+ ordering = ['-last_used', '-created_at']
\ No newline at end of file
diff --git a/crawler/apps.py b/crawler/apps.py
new file mode 100644
index 0000000..88478dd
--- /dev/null
+++ b/crawler/apps.py
@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class CrawlerConfig(AppConfig):
+ default_auto_field = 'django.db.models.BigAutoField'
+ name = 'crawler'
diff --git a/crawler/crawler_engine.py b/crawler/crawler_engine.py
new file mode 100644
index 0000000..4763ad8
--- /dev/null
+++ b/crawler/crawler_engine.py
@@ -0,0 +1,578 @@
+import requests
+import time
+import re
+import logging
+import os
+import urllib3
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
+from django.conf import settings
+from django.utils import timezone
+from django.core.files.base import ContentFile
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
+
+# 禁用SSL警告
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+# 设置日志记录器
+logger = logging.getLogger(__name__)
+
+
+class WebsiteCrawler:
+ """网站爬虫引擎"""
+
+    def __init__(self, task_id):
+        """Load the crawl task and prepare a retrying HTTP session.
+
+        Args:
+            task_id: Primary key of the ``CrawlTask`` to execute.
+        """
+        self.task = CrawlTask.objects.get(id=task_id)
+
+        # Keywords are stored as one comma-separated string; drop blank entries.
+        stripped = (part.strip() for part in self.task.keywords.split(','))
+        self.keywords = [word for word in stripped if word]
+
+        # Session carrying the configured User-Agent on every request.
+        self.session = requests.Session()
+        self.session.headers.update({
+            'User-Agent': settings.CRAWLER_SETTINGS['USER_AGENT']
+        })
+
+        # Retry rate-limit and server-error responses with backoff.
+        retries = Retry(
+            total=settings.CRAWLER_SETTINGS.get('MAX_RETRIES', 3),
+            backoff_factor=1,
+            status_forcelist=[429, 500, 502, 503, 504],
+        )
+        retrying_adapter = HTTPAdapter(max_retries=retries)
+        for scheme in ("http://", "https://"):
+            self.session.mount(scheme, retrying_adapter)
+
+        # Timeout handed to each session.get() call made by this crawler.
+        self.timeout = settings.CRAWLER_SETTINGS['TIMEOUT']
+
+    def log(self, level, message, website=None):
+        """Persist a log entry for this task and mirror it to Python logging.
+
+        Args:
+            level: Level name such as ``'info'``, ``'warning'`` or ``'error'``.
+            message: Text of the log entry.
+            website: Optional ``Website`` the entry relates to.
+        """
+        CrawlLog.objects.create(
+            task=self.task,
+            website=website,
+            level=level,
+            message=message,
+        )
+        # Resolve the numeric level defensively: an unknown name used to raise
+        # AttributeError after the database row had already been written.
+        numeric_level = getattr(logging, str(level).upper(), None)
+        if not isinstance(numeric_level, int):
+            numeric_level = logging.INFO
+        # Lazy %-style arguments: formatting only happens if the record is emitted.
+        logger.log(numeric_level, "Task %s: %s", self.task.id, message)
+
+    def update_task_status(self, status, **kwargs):
+        """Set the task status, stamp the timing fields and save the task.
+
+        Args:
+            status: New status value for the task.
+            **kwargs: Extra task attributes to set before saving
+                (for example ``error_message``).
+        """
+        task = self.task
+        task.status = status
+
+        if status == 'running':
+            # Only the first transition to "running" records the start time.
+            if not task.started_at:
+                task.started_at = timezone.now()
+        elif status in ('completed', 'failed', 'cancelled'):
+            task.completed_at = timezone.now()
+
+        for attr_name, attr_value in kwargs.items():
+            setattr(task, attr_name, attr_value)
+        task.save()
+
+    def extract_text_content(self, soup):
+        """Extract readable text from parsed HTML, keeping paragraph breaks.
+
+        ``<script>`` and ``<style>`` tags are removed from ``soup`` in place.
+
+        Args:
+            soup: A BeautifulSoup document or tag.
+
+        Returns:
+            The text with paragraphs separated by blank lines.
+        """
+        for tag in soup(["script", "style"]):
+            tag.decompose()
+
+        heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+        block_tags = ['p', 'div'] + heading_tags
+
+        paragraphs = []
+        for element in soup.find_all(block_tags + ['br']):
+            if element.name == 'br':
+                paragraphs.append('\n')
+                continue
+
+            if element.name in ('p', 'div') and element.find(block_tags) is not None:
+                # Container block: find_all() also yields the blocks nested in
+                # it, so emitting its get_text() here would repeat their text
+                # once per nesting level. Known limitation: text placed
+                # directly in such a container, next to nested blocks, is
+                # not emitted.
+                continue
+
+            text = element.get_text().strip()
+            if not text:
+                continue
+            if element.name in heading_tags:
+                # Surround headings with newlines so they stand apart.
+                paragraphs.append(f"\n{text}\n")
+            else:
+                paragraphs.append(text)
+
+        if not paragraphs:
+            # No block-level text was collected: fall back to the plain text,
+            # one non-empty line per paragraph.
+            stripped_lines = (line.strip() for line in soup.get_text().splitlines())
+            return '\n\n'.join(line for line in stripped_lines if line)
+
+        content = '\n\n'.join(paragraphs)
+        # Collapse runs of three or more line breaks into one blank line.
+        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
+        return content.strip()
+
+    def find_article_links(self, soup, base_url):
+        """Collect candidate article links from a listing page.
+
+        Args:
+            soup: Parsed listing page.
+            base_url: URL used to resolve relative ``href`` values.
+
+        Returns:
+            A list of ``{'url': ..., 'title': ...}`` dicts, one per distinct
+            HTTP(S) URL, in the order first encountered.
+        """
+        # Selectors that commonly match article links.
+        selectors = [
+            'a[href*="article"]',
+            'a[href*="news"]',
+            'a[href*="content"]',
+            'a[href*="detail"]',
+            'a[href*="view"]',
+            'a[href*="show"]',
+            '.news-list a',
+            '.article-list a',
+            '.content-list a',
+            'h3 a',
+            'h4 a',
+            '.title a',
+            '.list-item a',
+        ]
+
+        links = []
+        seen_urls = set()
+        for selector in selectors:
+            for anchor in soup.select(selector):
+                href = anchor.get('href')
+                if not href:
+                    continue
+
+                full_url = urljoin(base_url, href)
+                # Skip javascript:, mailto: and other non-fetchable targets.
+                if urlparse(full_url).scheme not in ('http', 'https'):
+                    continue
+                # The same anchor often matches several selectors; keep one
+                # entry per URL so each article is fetched and stored once.
+                if full_url in seen_urls:
+                    continue
+
+                title = anchor.get_text().strip()
+                if title and len(title) > 5:  # drop very short link texts
+                    seen_urls.add(full_url)
+                    links.append({
+                        'url': full_url,
+                        'title': title,
+                    })
+
+        return links
+
+    def check_keyword_match(self, text, title):
+        """Return the task keywords found in the text or the title.
+
+        Matching is a case-insensitive substring test.
+
+        Args:
+            text: Article body text.
+            title: Article title.
+
+        Returns:
+            The matching keywords, in the order of ``self.keywords``.
+        """
+        haystacks = (text.lower(), title.lower())
+        return [
+            keyword
+            for keyword in self.keywords
+            if any(keyword.lower() in haystack for haystack in haystacks)
+        ]
+
+    def extract_article_content(self, url, soup):
+        """Extract the main text of an article page.
+
+        Args:
+            url: Article URL (not used by the extraction itself).
+            soup: Parsed article page.
+
+        Returns:
+            Text of the first content container yielding more than 100
+            characters, otherwise the text of the whole page.
+        """
+        # Containers that commonly hold the article body, tried in order.
+        content_selectors = (
+            '.article-content',
+            '.content',
+            '.article-body',
+            '.news-content',
+            '.main-content',
+            '.post-content',
+            'article',
+            '.detail-content',
+            '#content',
+            '.text',
+        )
+
+        content = ""
+        for selector in content_selectors:
+            container = soup.select_one(selector)
+            if not container:
+                continue
+            content = self.extract_text_content(container)
+            if len(content) > 100:  # long enough to be the real body
+                break
+
+        # Nothing substantial found in a dedicated container: use the page.
+        if len(content) < 100:
+            content = self.extract_text_content(soup)
+
+        return content
+
+    def extract_publish_date(self, soup):
+        """Extract the article's publication time from common date elements.
+
+        The previous implementation returned the current time whenever a date
+        element existed; the element's text is now actually parsed.
+
+        Args:
+            soup: Parsed article page.
+
+        Returns:
+            A ``datetime`` (timezone-aware when ``settings.USE_TZ`` is on), or
+            ``None`` when no element holds a recognisable date.
+        """
+        from datetime import datetime
+
+        date_selectors = [
+            '.publish-time',
+            '.pub-time',
+            '.date',
+            '.time',
+            '.publish-date',
+            'time[datetime]',
+            '.article-time',
+            '.news-time',
+        ]
+
+        # Year-month-day separated by '-', '/', '.' or 年/月/日, optionally
+        # followed by HH:MM[:SS]. An explicit UTC offset in the text is
+        # ignored; the value is read as local time.
+        date_pattern = re.compile(
+            r'(\d{4})\s*[-/.年]\s*(\d{1,2})\s*[-/.月]\s*(\d{1,2})'
+            r'(?:\s*日)?(?:[T\s]+(\d{1,2}):(\d{2})(?::(\d{2}))?)?'
+        )
+
+        for selector in date_selectors:
+            element = soup.select_one(selector)
+            if element is None:
+                continue
+
+            # Prefer the machine-readable attribute over the visible text.
+            date_text = element.get('datetime') or element.get_text().strip()
+            match = date_pattern.search(date_text)
+            if match is None:
+                continue
+
+            parts = [int(group) if group else 0 for group in match.groups()]
+            try:
+                parsed = datetime(*parts)
+            except ValueError:
+                # Digits matched but do not form a valid date (e.g. month 13).
+                continue
+
+            if settings.USE_TZ:
+                parsed = timezone.make_aware(parsed)
+            return parsed
+
+        return None
+
+    def extract_author(self, soup):
+        """Return the text of the first author-like element, or ``""``.
+
+        Args:
+            soup: Parsed article page.
+        """
+        author_selectors = (
+            '.author',
+            '.writer',
+            '.publisher',
+            '.byline',
+            '.article-author',
+            '.news-author',
+        )
+
+        matches = (soup.select_one(selector) for selector in author_selectors)
+        first_match = next((found for found in matches if found), None)
+        if first_match is None:
+            return ""
+        return first_match.get_text().strip()
+
+    def download_media_file(self, media_url, crawled_content, media_type='image', alt_text=''):
+        """Download one media file and attach it to a crawled article.
+
+        Args:
+            media_url: Absolute HTTP(S) URL of the file.
+            crawled_content: ``CrawledContent`` the file belongs to.
+            media_type: One of the ``MediaFile`` media type values.
+            alt_text: Alternative text taken from the source tag, if any.
+
+        Returns:
+            The saved ``MediaFile``, or ``None`` when the URL is unusable or
+            the download or save fails (the failure is logged).
+        """
+        try:
+            if not media_url or not media_url.startswith(('http://', 'https://')):
+                return None
+
+            # NOTE(review): verify=False disables TLS certificate validation;
+            # kept because the rest of this crawler does the same.
+            response = self.session.get(
+                media_url,
+                timeout=self.timeout,
+                verify=False,
+                stream=False,  # read the whole body before continuing
+            )
+            response.raise_for_status()
+
+            payload = response.content
+            content_type = response.headers.get('content-type', '')
+            # Content-Length counts bytes on the wire (possibly compressed);
+            # measure the decoded body that is actually stored.
+            file_size = len(payload)
+
+            file_extension = self.get_file_extension_from_url(media_url, content_type)
+            # count() asks the database instead of loading every row.
+            sequence = crawled_content.media_files.count()
+            filename = f"media_{crawled_content.id}_{sequence}{file_extension}"
+
+            # Build the row unsaved: local_file.save(..., save=True) stores the
+            # file and then saves the model, so a failed write no longer
+            # leaves a MediaFile row without a file.
+            media_file = MediaFile(
+                content=crawled_content,
+                media_type=media_type,
+                original_url=media_url,
+                file_size=file_size,
+                mime_type=content_type[:100],  # column is limited to 100 chars
+                alt_text=alt_text,
+            )
+            media_file.local_file.save(
+                filename,
+                ContentFile(payload),
+                save=True,
+            )
+
+            self.log('info', f'媒体文件已下载: {filename} ({media_type})', crawled_content.website)
+            return media_file
+
+        except Exception as e:
+            # Best effort: one broken media file must not abort the article.
+            self.log('error', f'下载媒体文件失败 {media_url}: {str(e)}', crawled_content.website)
+            return None
+
+    def get_file_extension_from_url(self, url, content_type):
+        """Choose a file extension from the URL path or the content type.
+
+        Args:
+            url: URL the file was downloaded from.
+            content_type: Value of the ``Content-Type`` header; may be empty,
+                ``None`` or carry parameters such as ``; charset=...``.
+
+        Returns:
+            The extension including the leading dot; ``'.bin'`` when neither
+            source identifies the type.
+        """
+        # Prefer the extension of the URL's last path segment. A dot elsewhere
+        # in the path (e.g. "/v1.2/image") yields no extension and must fall
+        # through to the content type instead of returning "".
+        extension = os.path.splitext(urlparse(url).path)[1]
+        if extension:
+            return extension
+
+        content_type_map = {
+            'image/jpeg': '.jpg',
+            'image/jpg': '.jpg',
+            'image/png': '.png',
+            'image/gif': '.gif',
+            'image/webp': '.webp',
+            'image/svg+xml': '.svg',
+            'video/mp4': '.mp4',
+            'video/avi': '.avi',
+            'video/mov': '.mov',
+            'video/wmv': '.wmv',
+            'video/flv': '.flv',
+            'video/webm': '.webm',
+            'audio/mp3': '.mp3',
+            'audio/wav': '.wav',
+            'audio/ogg': '.ogg',
+            'application/pdf': '.pdf',
+            'application/msword': '.doc',
+            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
+        }
+
+        # Strip parameters so "image/jpeg; charset=binary" still matches.
+        mime = (content_type or '').split(';', 1)[0].strip().lower()
+        return content_type_map.get(mime, '.bin')
+
+    def _resolve_media_url(self, src, base_url):
+        """Turn a raw ``src`` attribute into an absolute URL."""
+        if src.startswith('//'):
+            # Protocol-relative URL: this crawler has always assumed HTTPS.
+            return 'https:' + src
+        if src.startswith(('http://', 'https://')):
+            return src
+        return urljoin(base_url, src)
+
+    def extract_and_download_media(self, soup, crawled_content, base_url):
+        """Download the images, videos and audio referenced by a page.
+
+        Args:
+            soup: Parsed article page.
+            crawled_content: ``CrawledContent`` the media belongs to.
+            base_url: URL used to resolve relative ``src`` values.
+
+        Returns:
+            The list of media file objects that were downloaded successfully.
+        """
+        website = crawled_content.website
+
+        images = soup.find_all('img')
+        self.log('info', f'找到 {len(images)} 个图片标签', website)
+
+        candidates = [(img, 'image') for img in images]
+        for tag in soup.find_all(['video', 'audio', 'source']):
+            if tag.name == 'source':
+                # A <source> inherits its kind from the enclosing element;
+                # previously every <source> was stored as video and the
+                # sources of <audio> elements were mislabelled.
+                parent_name = getattr(tag.parent, 'name', None)
+                media_type = 'audio' if parent_name == 'audio' else 'video'
+            else:
+                media_type = tag.name
+            candidates.append((tag, media_type))
+
+        media_files = []
+        seen_urls = set()
+        for tag, media_type in candidates:
+            src = tag.get('src')
+            if not src:
+                continue
+
+            media_url = self._resolve_media_url(src, base_url)
+            # data: URIs and other non-HTTP sources cannot be downloaded.
+            if not media_url.startswith(('http://', 'https://')):
+                continue
+            # Fetch each distinct file once even if the page repeats it.
+            if media_url in seen_urls:
+                continue
+            seen_urls.add(media_url)
+
+            is_image = media_type == 'image'
+            alt_text = tag.get('alt', '') if is_image else ''
+            if is_image:
+                self.log('info', f'尝试下载图片: {media_url}', website)
+
+            media_file = self.download_media_file(media_url, crawled_content, media_type, alt_text)
+            if not media_file:
+                continue
+
+            media_files.append(media_file)
+            if is_image:
+                self.log('info', f'成功下载图片: {media_file.local_file.name}', website)
+
+        return media_files
+
+    def mark_content_saved(self, crawled_content):
+        """Flag an article as stored and log how many media files it has.
+
+        Args:
+            crawled_content: The ``CrawledContent`` row to flag.
+
+        Returns:
+            ``True`` on success, ``False`` if flagging failed (the error is
+            logged instead of raised).
+        """
+        site = crawled_content.website
+        try:
+            crawled_content.is_local_saved = True
+            crawled_content.save()
+
+            media_count = crawled_content.media_files.count()
+            self.log('info', f'文章内容已保存到数据库 (包含 {media_count} 个媒体文件)', site)
+        except Exception as e:
+            self.log('error', f'标记内容保存状态失败: {str(e)}', site)
+            return False
+        return True
+
+    def _fetch_soup(self, url):
+        """Download ``url`` and parse it, honouring a charset sent in the headers."""
+        # NOTE(review): verify=False skips TLS certificate validation.
+        response = self.session.get(
+            url,
+            timeout=self.timeout,
+            verify=False,
+        )
+        response.raise_for_status()
+
+        # The parser receives raw bytes, so setting response.encoding (as the
+        # old code did) had no effect. Pass the declared charset to the
+        # parser; with None it detects the encoding itself.
+        declared_charset = None
+        content_type = response.headers.get('content-type', '').lower()
+        if 'charset=' in content_type:
+            raw_charset = content_type.split('charset=')[-1].split(';')[0]
+            declared_charset = raw_charset.strip().strip('"\'') or None
+
+        return BeautifulSoup(response.content, 'html.parser', from_encoding=declared_charset)
+
+    def crawl_website(self, website):
+        """Crawl one website and store the articles that match the keywords.
+
+        Args:
+            website: The ``Website`` to crawl.
+
+        Returns:
+            Number of articles saved; ``0`` when the listing page itself could
+            not be fetched (the error is logged).
+        """
+        self.log('info', f'开始爬取网站: {website.name}', website)
+
+        try:
+            soup = self._fetch_soup(website.url)
+
+            article_links = self.find_article_links(soup, website.url)
+            self.log('info', f'找到 {len(article_links)} 个文章链接', website)
+
+            crawled_count = 0
+            visited_urls = set()
+            for link_info in article_links:
+                article_url = link_info['url']
+                # The same article may be linked several times on one page.
+                if article_url in visited_urls:
+                    continue
+                visited_urls.add(article_url)
+
+                try:
+                    article_soup = self._fetch_soup(article_url)
+
+                    content = self.extract_article_content(article_url, article_soup)
+                    title = link_info['title']
+                    matched_keywords = self.check_keyword_match(content, title)
+
+                    if matched_keywords:
+                        publish_date = self.extract_publish_date(article_soup)
+                        author = self.extract_author(article_soup)
+
+                        crawled_content = CrawledContent.objects.create(
+                            task=self.task,
+                            website=website,
+                            title=title[:500],  # column limit
+                            content=content,
+                            url=article_url,
+                            publish_date=publish_date,
+                            author=author[:100],  # column limit
+                            keywords_matched=','.join(matched_keywords),
+                            is_local_saved=False,  # flipped by mark_content_saved()
+                        )
+
+                        self.extract_and_download_media(article_soup, crawled_content, article_url)
+                        self.mark_content_saved(crawled_content)
+
+                        crawled_count += 1
+                        self.log('info', f'保存文章: {title[:50]}...', website)
+
+                    # Pause between article requests.
+                    # NOTE(review): the source's indentation was lost; this is
+                    # assumed to apply to every article, not only matches.
+                    time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])
+
+                # Most specific exception first: SSLError is a ConnectionError.
+                except requests.exceptions.SSLError as e:
+                    self.log('error', f'SSL错误,跳过文章 {article_url}: {str(e)}', website)
+                except requests.exceptions.ConnectionError as e:
+                    self.log('error', f'连接错误,跳过文章 {article_url}: {str(e)}', website)
+                except requests.exceptions.Timeout as e:
+                    self.log('error', f'请求超时,跳过文章 {article_url}: {str(e)}', website)
+                except requests.exceptions.RequestException as e:
+                    self.log('error', f'网络请求错误,跳过文章 {article_url}: {str(e)}', website)
+                except UnicodeDecodeError as e:
+                    self.log('error', f'字符编码错误,跳过文章 {article_url}: {str(e)}', website)
+                except Exception as e:
+                    # One broken article must not stop the rest of the site.
+                    self.log('error', f'处理文章失败 {article_url}: {str(e)}', website)
+
+            self.log('info', f'网站爬取完成,共保存 {crawled_count} 篇文章', website)
+            return crawled_count
+
+        except requests.exceptions.SSLError as e:
+            self.log('error', f'爬取网站SSL错误: {str(e)}', website)
+            return 0
+        except requests.exceptions.ConnectionError as e:
+            self.log('error', f'爬取网站连接错误: {str(e)}', website)
+            return 0
+        except requests.exceptions.Timeout as e:
+            self.log('error', f'爬取网站超时: {str(e)}', website)
+            return 0
+        except requests.exceptions.RequestException as e:
+            self.log('error', f'爬取网站网络错误: {str(e)}', website)
+            return 0
+        except Exception as e:
+            self.log('error', f'爬取网站失败: {str(e)}', website)
+            return 0
+
+    def run(self):
+        """Execute the task: crawl every active website and record the result.
+
+        The task ends as ``completed`` when at least one article was saved
+        and as ``failed`` otherwise.
+        """
+        self.log('info', f'开始执行爬取任务: {self.task.name}')
+        self.update_task_status('running')
+
+        active_sites = self.task.websites.filter(is_active=True)
+        self.task.total_pages = active_sites.count()
+        self.task.save()
+
+        total_crawled = 0
+        for site in active_sites:
+            try:
+                total_crawled += self.crawl_website(site)
+                # Progress is counted per website, not per article.
+                self.task.crawled_pages += 1
+                self.task.save()
+            except Exception as e:
+                self.log('error', f'爬取网站 {site.name} 时发生错误: {str(e)}', site)
+
+        # Record the final outcome.
+        if total_crawled <= 0:
+            self.update_task_status('failed', error_message='没有找到匹配的内容')
+            self.log('error', '爬取任务失败,没有找到匹配的内容')
+            return
+
+        self.update_task_status('completed')
+        self.log('info', f'爬取任务完成,共爬取 {total_crawled} 篇文章')
+
+
+def run_crawl_task(task_id):
+    """Run a crawl task and report the outcome as a message string.
+
+    Args:
+        task_id: Primary key of the ``CrawlTask`` to run.
+
+    Returns:
+        A message saying whether the task finished or failed. Failures are
+        logged and recorded on the task instead of being raised.
+    """
+    try:
+        crawler = WebsiteCrawler(task_id)
+        crawler.run()
+        return f"任务 {task_id} 执行完成"
+    except Exception as e:
+        logger.error(f"执行任务 {task_id} 时发生异常: {str(e)}", exc_info=True)
+
+        # The failure may be that the task does not exist at all; a second
+        # objects.get() here would raise again and hide the original error.
+        task = CrawlTask.objects.filter(id=task_id).first()
+        if task is not None:
+            task.status = 'failed'
+            task.error_message = str(e)
+            task.completed_at = timezone.now()
+            task.save()
+
+            CrawlLog.objects.create(
+                task=task,
+                level='error',
+                message=f'任务执行失败: {str(e)}'
+            )
+        return f"任务 {task_id} 执行失败: {str(e)}"
\ No newline at end of file
diff --git a/crawler/management/__init__.py b/crawler/management/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler/management/commands/__init__.py b/crawler/management/commands/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler/management/commands/init_websites.py b/crawler/management/commands/init_websites.py
new file mode 100644
index 0000000..d0e01e1
--- /dev/null
+++ b/crawler/management/commands/init_websites.py
@@ -0,0 +1,36 @@
+from django.core.management.base import BaseCommand
+from django.conf import settings
+from crawler.models import Website
+
+
+class Command(BaseCommand):
+    """Seed ``Website`` rows from ``settings.TARGET_WEBSITES``.
+
+    Existing websites are kept unless ``--reset`` is passed. Deleting them
+    unconditionally, as before, also removed every crawled article and log
+    entry that references them and made the "already exists" branch below
+    unreachable.
+    """
+
+    help = '初始化目标网站数据'
+
+    def add_arguments(self, parser):
+        """Register the optional ``--reset`` flag."""
+        parser.add_argument(
+            '--reset',
+            action='store_true',
+            help='先删除所有现有网站(会级联删除其爬取内容和日志)'
+        )
+
+    def handle(self, *args, **options):
+        """Create the missing websites and report how many were added."""
+        self.stdout.write('开始初始化目标网站数据...')
+
+        if options.get('reset'):
+            # Destructive: related contents and logs are deleted by cascade.
+            Website.objects.all().delete()
+
+        websites_data = settings.TARGET_WEBSITES
+
+        created_count = 0
+        for website_data in websites_data:
+            website, created = Website.objects.get_or_create(
+                url=website_data['url'],
+                defaults={
+                    'name': website_data['name'],
+                    'region': website_data['region'],
+                    'is_active': True
+                }
+            )
+            if created:
+                created_count += 1
+                self.stdout.write(f'创建网站: {website.name}')
+            else:
+                self.stdout.write(f'网站已存在: {website.name}')
+
+        self.stdout.write(
+            self.style.SUCCESS(f'成功初始化 {created_count} 个网站')
+        )
diff --git a/crawler/management/commands/run_crawler.py b/crawler/management/commands/run_crawler.py
new file mode 100644
index 0000000..efeaa54
--- /dev/null
+++ b/crawler/management/commands/run_crawler.py
@@ -0,0 +1,69 @@
+from django.core.management.base import BaseCommand
+from crawler.models import CrawlTask, Website
+from crawler.tasks import crawl_websites_task
+
+
+class Command(BaseCommand):
+    """Create a crawl task from the command line and start it."""
+
+    help = '运行爬虫任务'
+
+    def add_arguments(self, parser):
+        """Register ``--keywords`` (required), ``--websites`` and ``--name``."""
+        parser.add_argument(
+            '--keywords',
+            type=str,
+            required=True,
+            help='搜索关键字,多个关键字用逗号分隔'
+        )
+        parser.add_argument(
+            '--websites',
+            type=str,
+            help='网站ID列表,用逗号分隔。不指定则爬取所有网站'
+        )
+        parser.add_argument(
+            '--name',
+            type=str,
+            help='任务名称'
+        )
+
+    def handle(self, *args, **options):
+        """Build the task, then queue it or run it inline if queueing fails."""
+        from django.core.management.base import CommandError
+
+        keywords = options['keywords']
+        website_ids = options.get('websites')
+        # argparse always stores the key (as None when the flag is omitted),
+        # so a dict.get() default is never used; fall back explicitly.
+        task_name = options.get('name') or f'关键字搜索: {keywords}'
+        task_name = task_name[:200]  # CrawlTask.name column limit
+
+        # Resolve the target websites.
+        if website_ids:
+            try:
+                website_id_list = [
+                    int(token) for token in website_ids.split(',') if token.strip()
+                ]
+            except ValueError:
+                raise CommandError(f'无效的网站ID列表: {website_ids}')
+            websites = Website.objects.filter(id__in=website_id_list, is_active=True)
+        else:
+            websites = Website.objects.filter(is_active=True)
+
+        if not websites.exists():
+            self.stdout.write(
+                self.style.ERROR('没有找到可用的网站')
+            )
+            return
+
+        task = CrawlTask.objects.create(
+            name=task_name,
+            keywords=keywords,
+            created_by='management_command'
+        )
+        task.websites.set(websites)
+
+        self.stdout.write(f'创建任务: {task.name}')
+        self.stdout.write(f'目标网站: {websites.count()} 个')
+        self.stdout.write(f'搜索关键字: {keywords}')
+
+        # Prefer the task queue; run synchronously when it is unavailable.
+        try:
+            crawl_websites_task.delay(task.id)
+            self.stdout.write('任务已提交到队列')
+        except Exception as e:
+            self.stdout.write(f'队列不可用,直接运行任务: {e}')
+            from crawler.crawler_engine import WebsiteCrawler
+            crawler = WebsiteCrawler(task.id)
+            crawler.run()
+
+        self.stdout.write(
+            self.style.SUCCESS(f'任务已启动,任务ID: {task.id}')
+        )
diff --git a/crawler/migrations/0001_initial.py b/crawler/migrations/0001_initial.py
new file mode 100644
index 0000000..eba286f
--- /dev/null
+++ b/crawler/migrations/0001_initial.py
@@ -0,0 +1,106 @@
+# Generated by Django 5.2.6 on 2025-09-22 16:27
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ initial = True
+
+ dependencies = [
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name='SearchKeyword',
+ fields=[
+ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('keyword', models.CharField(max_length=100, unique=True, verbose_name='关键字')),
+ ('is_active', models.BooleanField(default=True, verbose_name='是否启用')),
+ ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
+ ('last_used', models.DateTimeField(blank=True, null=True, verbose_name='最后使用时间')),
+ ],
+ options={
+ 'verbose_name': '搜索关键字',
+ 'verbose_name_plural': '搜索关键字',
+ 'ordering': ['-last_used', '-created_at'],
+ },
+ ),
+ migrations.CreateModel(
+ name='Website',
+ fields=[
+ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('name', models.CharField(max_length=100, verbose_name='网站名称')),
+ ('url', models.URLField(verbose_name='网站地址')),
+ ('region', models.CharField(max_length=50, verbose_name='所属地区')),
+ ('is_active', models.BooleanField(default=True, verbose_name='是否启用')),
+ ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
+ ('updated_at', models.DateTimeField(auto_now=True, verbose_name='更新时间')),
+ ],
+ options={
+ 'verbose_name': '目标网站',
+ 'verbose_name_plural': '目标网站',
+ 'ordering': ['region', 'name'],
+ },
+ ),
+ migrations.CreateModel(
+ name='CrawlTask',
+ fields=[
+ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('name', models.CharField(max_length=200, verbose_name='任务名称')),
+ ('keywords', models.TextField(help_text='多个关键字用逗号分隔', verbose_name='搜索关键字')),
+ ('status', models.CharField(choices=[('pending', '待执行'), ('running', '执行中'), ('completed', '已完成'), ('failed', '执行失败'), ('cancelled', '已取消')], default='pending', max_length=20, verbose_name='任务状态')),
+ ('created_by', models.CharField(default='system', max_length=100, verbose_name='创建者')),
+ ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
+ ('started_at', models.DateTimeField(blank=True, null=True, verbose_name='开始时间')),
+ ('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='完成时间')),
+ ('error_message', models.TextField(blank=True, verbose_name='错误信息')),
+ ('total_pages', models.IntegerField(default=0, verbose_name='总页数')),
+ ('crawled_pages', models.IntegerField(default=0, verbose_name='已爬取页数')),
+ ('websites', models.ManyToManyField(to='crawler.website', verbose_name='目标网站')),
+ ],
+ options={
+ 'verbose_name': '爬取任务',
+ 'verbose_name_plural': '爬取任务',
+ 'ordering': ['-created_at'],
+ },
+ ),
+ migrations.CreateModel(
+ name='CrawlLog',
+ fields=[
+ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('level', models.CharField(choices=[('info', '信息'), ('warning', '警告'), ('error', '错误'), ('debug', '调试')], max_length=20, verbose_name='日志级别')),
+ ('message', models.TextField(verbose_name='日志消息')),
+ ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='记录时间')),
+ ('task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='logs', to='crawler.crawltask', verbose_name='所属任务')),
+ ('website', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='crawler.website', verbose_name='相关网站')),
+ ],
+ options={
+ 'verbose_name': '爬取日志',
+ 'verbose_name_plural': '爬取日志',
+ 'ordering': ['-created_at'],
+ },
+ ),
+ migrations.CreateModel(
+ name='CrawledContent',
+ fields=[
+ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('title', models.CharField(max_length=500, verbose_name='标题')),
+ ('content', models.TextField(verbose_name='内容')),
+ ('url', models.URLField(verbose_name='原文链接')),
+ ('publish_date', models.DateTimeField(blank=True, null=True, verbose_name='发布时间')),
+ ('author', models.CharField(blank=True, max_length=100, verbose_name='作者')),
+ ('keywords_matched', models.TextField(help_text='匹配到的关键字,用逗号分隔', verbose_name='匹配的关键字')),
+ ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='爬取时间')),
+ ('task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='contents', to='crawler.crawltask', verbose_name='所属任务')),
+ ('website', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawler.website', verbose_name='来源网站')),
+ ],
+ options={
+ 'verbose_name': '爬取内容',
+ 'verbose_name_plural': '爬取内容',
+ 'ordering': ['-created_at'],
+ 'indexes': [models.Index(fields=['task', 'website'], name='crawler_cra_task_id_6244e7_idx'), models.Index(fields=['created_at'], name='crawler_cra_created_a116d2_idx'), models.Index(fields=['publish_date'], name='crawler_cra_publish_5b8ccc_idx')],
+ },
+ ),
+ ]
diff --git a/crawler/migrations/0002_crawledcontent_is_local_saved_and_more.py b/crawler/migrations/0002_crawledcontent_is_local_saved_and_more.py
new file mode 100644
index 0000000..dbe2d0a
--- /dev/null
+++ b/crawler/migrations/0002_crawledcontent_is_local_saved_and_more.py
@@ -0,0 +1,24 @@
+# Generated by Django 5.2.6 on 2025-09-23 00:38
+
+import crawler.models
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('crawler', '0001_initial'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='crawledcontent',
+ name='is_local_saved',
+ field=models.BooleanField(default=False, verbose_name='是否已本地保存'),
+ ),
+ migrations.AddField(
+ model_name='crawledcontent',
+ name='local_file',
+ field=models.FileField(blank=True, null=True, upload_to=crawler.models.crawled_content_file_path, verbose_name='本地文件'),
+ ),
+ ]
diff --git a/crawler/migrations/0003_mediafile.py b/crawler/migrations/0003_mediafile.py
new file mode 100644
index 0000000..323b497
--- /dev/null
+++ b/crawler/migrations/0003_mediafile.py
@@ -0,0 +1,35 @@
+# Generated by Django 5.2.6 on 2025-09-23 01:05
+
+import crawler.models
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('crawler', '0002_crawledcontent_is_local_saved_and_more'),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name='MediaFile',
+ fields=[
+ ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('media_type', models.CharField(choices=[('image', '图片'), ('video', '视频'), ('audio', '音频'), ('document', '文档')], max_length=20, verbose_name='媒体类型')),
+ ('original_url', models.URLField(verbose_name='原始URL')),
+ ('local_file', models.FileField(upload_to=crawler.models.media_file_path, verbose_name='本地文件')),
+ ('file_size', models.BigIntegerField(blank=True, null=True, verbose_name='文件大小(字节)')),
+ ('mime_type', models.CharField(blank=True, max_length=100, verbose_name='MIME类型')),
+ ('alt_text', models.CharField(blank=True, max_length=500, verbose_name='替代文本')),
+ ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
+ ('content', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='media_files', to='crawler.crawledcontent', verbose_name='所属内容')),
+ ],
+ options={
+ 'verbose_name': '媒体文件',
+ 'verbose_name_plural': '媒体文件',
+ 'ordering': ['-created_at'],
+ 'indexes': [models.Index(fields=['content', 'media_type'], name='crawler_med_content_3a9468_idx'), models.Index(fields=['created_at'], name='crawler_med_created_13ff00_idx')],
+ },
+ ),
+ ]
diff --git a/crawler/migrations/__init__.py b/crawler/migrations/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/crawler/models.py b/crawler/models.py
new file mode 100644
index 0000000..bfa6ebb
--- /dev/null
+++ b/crawler/models.py
@@ -0,0 +1,195 @@
+from django.db import models
+from django.utils import timezone
+import os
+
+
+def crawled_content_file_path(instance, filename):
+    """Build the storage path for a crawled content file.
+
+    The path combines the task id, a timestamp and the content id, and keeps
+    the extension of the uploaded ``filename``.
+    """
+    extension = os.path.splitext(filename)[1]
+    stamp = timezone.now().strftime('%Y%m%d_%H%M%S')
+    return f'crawled_content/{instance.task.id}/{stamp}_{instance.id}{extension}'
+
+
+def media_file_path(instance, filename):
+    """Build the storage path for a downloaded media file.
+
+    Files are grouped by task id and content id; the original base name is
+    kept and prefixed with a timestamp.
+    """
+    stem, extension = os.path.splitext(filename)
+    stamp = timezone.now().strftime('%Y%m%d_%H%M%S')
+    owner = instance.content
+    return f'{owner.task.id}/{owner.id}/{stamp}_{stem}{extension}'
+
+
+class Website(models.Model):
+ """目标网站模型"""
+ name = models.CharField(max_length=100, verbose_name='网站名称')
+ url = models.URLField(verbose_name='网站地址')
+ region = models.CharField(max_length=50, verbose_name='所属地区')
+ is_active = models.BooleanField(default=True, verbose_name='是否启用')
+ created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
+ updated_at = models.DateTimeField(auto_now=True, verbose_name='更新时间')
+
+ class Meta:
+ verbose_name = '目标网站'
+ verbose_name_plural = '目标网站'
+ ordering = ['region', 'name']
+
+ def __str__(self):
+ return f"{self.region} - {self.name}"
+
+
+class CrawlTask(models.Model):
+ """爬取任务模型"""
+ TASK_STATUS_CHOICES = [
+ ('pending', '待执行'),
+ ('running', '执行中'),
+ ('completed', '已完成'),
+ ('failed', '执行失败'),
+ ('cancelled', '已取消'),
+ ]
+
+ name = models.CharField(max_length=200, verbose_name='任务名称')
+ keywords = models.TextField(verbose_name='搜索关键字', help_text='多个关键字用逗号分隔')
+ websites = models.ManyToManyField(Website, verbose_name='目标网站')
+ status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name='任务状态')
+ created_by = models.CharField(max_length=100, verbose_name='创建者', default='system')
+ created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
+ started_at = models.DateTimeField(null=True, blank=True, verbose_name='开始时间')
+ completed_at = models.DateTimeField(null=True, blank=True, verbose_name='完成时间')
+ error_message = models.TextField(blank=True, verbose_name='错误信息')
+ total_pages = models.IntegerField(default=0, verbose_name='总页数')
+ crawled_pages = models.IntegerField(default=0, verbose_name='已爬取页数')
+
+ class Meta:
+ verbose_name = '爬取任务'
+ verbose_name_plural = '爬取任务'
+ ordering = ['-created_at']
+
+ def __str__(self):
+ return f"{self.name} - {self.get_status_display()}"
+
+    @property
+    def progress_percentage(self):
+        """Return task progress as a percentage rounded to two decimals.
+
+        Returns ``0`` while ``total_pages`` is zero.
+        """
+        total = self.total_pages
+        if total == 0:
+            return 0
+        ratio = self.crawled_pages / total
+        return round(ratio * 100, 2)
+
+
+class CrawledContent(models.Model):
+ """爬取内容模型"""
+ task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='contents', verbose_name='所属任务')
+ website = models.ForeignKey(Website, on_delete=models.CASCADE, verbose_name='来源网站')
+ title = models.CharField(max_length=500, verbose_name='标题')
+ content = models.TextField(verbose_name='内容')
+ url = models.URLField(verbose_name='原文链接')
+ publish_date = models.DateTimeField(null=True, blank=True, verbose_name='发布时间')
+ author = models.CharField(max_length=100, blank=True, verbose_name='作者')
+ keywords_matched = models.TextField(verbose_name='匹配的关键字', help_text='匹配到的关键字,用逗号分隔')
+ created_at = models.DateTimeField(auto_now_add=True, verbose_name='爬取时间')
+
+ # 添加本地存储字段
+ local_file = models.FileField(upload_to=crawled_content_file_path, blank=True, null=True, verbose_name='本地文件')
+ is_local_saved = models.BooleanField(default=False, verbose_name='是否已本地保存')
+
+ class Meta:
+ verbose_name = '爬取内容'
+ verbose_name_plural = '爬取内容'
+ ordering = ['-created_at']
+ indexes = [
+ models.Index(fields=['task', 'website']),
+ models.Index(fields=['created_at']),
+ models.Index(fields=['publish_date']),
+ ]
+
+ def __str__(self):
+ return f"{self.website.name} - {self.title[:50]}"
+
+ def get_preview_content(self, max_length=500):
+ """获取预览内容"""
+ if len(self.content) <= max_length:
+ return self.content
+ return self.content[:max_length] + '...'
+
+
+class MediaFile(models.Model):
+ """媒体文件模型"""
+ MEDIA_TYPE_CHOICES = [
+ ('image', '图片'),
+ ('video', '视频'),
+ ('audio', '音频'),
+ ('document', '文档'),
+ ]
+
+ content = models.ForeignKey(CrawledContent, on_delete=models.CASCADE, related_name='media_files', verbose_name='所属内容')
+ media_type = models.CharField(max_length=20, choices=MEDIA_TYPE_CHOICES, verbose_name='媒体类型')
+ original_url = models.URLField(verbose_name='原始URL')
+ local_file = models.FileField(upload_to=media_file_path, verbose_name='本地文件')
+ file_size = models.BigIntegerField(null=True, blank=True, verbose_name='文件大小(字节)')
+ mime_type = models.CharField(max_length=100, blank=True, verbose_name='MIME类型')
+ alt_text = models.CharField(max_length=500, blank=True, verbose_name='替代文本')
+ created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
+
+ class Meta:
+ verbose_name = '媒体文件'
+ verbose_name_plural = '媒体文件'
+ ordering = ['-created_at']
+ indexes = [
+ models.Index(fields=['content', 'media_type']),
+ models.Index(fields=['created_at']),
+ ]
+
+ def __str__(self):
+ return f"{self.get_media_type_display()} - {self.original_url}"
+
+    @property
+    def file_size_display(self):
+        """Return the file size as text with a unit (B, KB, MB, GB or TB).
+
+        Returns ``"未知"`` when the size is missing or zero.
+        """
+        if not self.file_size:
+            return "未知"
+
+        units = ('B', 'KB', 'MB', 'GB')
+        remaining = self.file_size
+        position = 0
+        while position < len(units):
+            if remaining < 1024.0:
+                return f"{remaining:.1f} {units[position]}"
+            remaining /= 1024.0
+            position += 1
+        # Anything at or beyond 1024 GB is reported in terabytes.
+        return f"{remaining:.1f} TB"
+
+
+class CrawlLog(models.Model):
+ """爬取日志模型"""
+ LOG_LEVEL_CHOICES = [
+ ('info', '信息'),
+ ('warning', '警告'),
+ ('error', '错误'),
+ ('debug', '调试'),
+ ]
+
+ task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='logs', verbose_name='所属任务')
+ website = models.ForeignKey(Website, on_delete=models.CASCADE, null=True, blank=True, verbose_name='相关网站')
+ level = models.CharField(max_length=20, choices=LOG_LEVEL_CHOICES, verbose_name='日志级别')
+ message = models.TextField(verbose_name='日志消息')
+ created_at = models.DateTimeField(auto_now_add=True, verbose_name='记录时间')
+
+ class Meta:
+ verbose_name = '爬取日志'
+ verbose_name_plural = '爬取日志'
+ ordering = ['-created_at']
+
+ def __str__(self):
+ return f"[{self.get_level_display()}] {self.message[:100]}"
+
+
+class SearchKeyword(models.Model):
+ """搜索关键字模型"""
+ keyword = models.CharField(max_length=100, unique=True, verbose_name='关键字')
+ is_active = models.BooleanField(default=True, verbose_name='是否启用')
+ created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
+ last_used = models.DateTimeField(null=True, blank=True, verbose_name='最后使用时间')
+
+ class Meta:
+ verbose_name = '搜索关键字'
+ verbose_name_plural = '搜索关键字'
+ ordering = ['-last_used', '-created_at']
+
+ def __str__(self):
+ return self.keyword
\ No newline at end of file
diff --git a/crawler/serializers.py b/crawler/serializers.py
new file mode 100644
index 0000000..54f9841
--- /dev/null
+++ b/crawler/serializers.py
@@ -0,0 +1,51 @@
+from rest_framework import serializers
+from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
+
+
+class WebsiteSerializer(serializers.ModelSerializer):
+ # Model serializer for Website that exposes every model field ('__all__').
+ class Meta:
+ model = Website
+ fields = '__all__'
+
+
+class CrawlTaskSerializer(serializers.ModelSerializer):
+ # Model serializer for CrawlTask that exposes every model field ('__all__').
+ # Related websites are rendered as nested WebsiteSerializer objects, read-only.
+ websites = WebsiteSerializer(many=True, read_only=True)
+ # Read-only value taken from the instance; presumably a property on CrawlTask (confirm in models).
+ progress_percentage = serializers.ReadOnlyField()
+
+ class Meta:
+ model = CrawlTask
+ fields = '__all__'
+
+
+class MediaFileSerializer(serializers.ModelSerializer):
+ # Model serializer for MediaFile that exposes every model field ('__all__').
+ # Read-only value taken from the instance attribute of the same name.
+ file_size_display = serializers.ReadOnlyField()
+
+ class Meta:
+ model = MediaFile
+ fields = '__all__'
+
+
+class CrawledContentSerializer(serializers.ModelSerializer):
+ # Model serializer for CrawledContent that exposes every model field ('__all__').
+ # Read-only convenience fields pulled from the related website and task objects.
+ website_name = serializers.CharField(source='website.name', read_only=True)
+ website_region = serializers.CharField(source='website.region', read_only=True)
+ task_name = serializers.CharField(source='task.name', read_only=True)
+ # Attached media files rendered as nested MediaFileSerializer objects, read-only.
+ media_files = MediaFileSerializer(many=True, read_only=True)
+
+ class Meta:
+ model = CrawledContent
+ fields = '__all__'
+
+
+class CrawlLogSerializer(serializers.ModelSerializer):
+ # Model serializer for CrawlLog that exposes every model field ('__all__').
+ # Read-only names pulled from the related website and task objects.
+ website_name = serializers.CharField(source='website.name', read_only=True)
+ task_name = serializers.CharField(source='task.name', read_only=True)
+
+ class Meta:
+ model = CrawlLog
+ fields = '__all__'
+
+
+class SearchKeywordSerializer(serializers.ModelSerializer):
+ # Model serializer for SearchKeyword that exposes every model field ('__all__').
+ class Meta:
+ model = SearchKeyword
+ fields = '__all__'
diff --git a/crawler/tasks.py b/crawler/tasks.py
new file mode 100644
index 0000000..c860eaa
--- /dev/null
+++ b/crawler/tasks.py
@@ -0,0 +1,36 @@
+from celery import shared_task
+from .crawler_engine import run_crawl_task as execute_crawl_task
+
+
+@shared_task
+def crawl_websites_task(task_id):
+ """Celery task that crawls websites.
+
+ Passes task_id to the crawler engine's run_crawl_task (imported here as
+ execute_crawl_task) and returns its result.
+ """
+ return execute_crawl_task(task_id)
+
+
+@shared_task
+def run_crawl_task(task_id):
+ """Celery task that executes a crawl task (provided for the admin interface).
+
+ Passes task_id to the crawler engine's run_crawl_task (imported here as
+ execute_crawl_task) and returns its result.
+ """
+ return execute_crawl_task(task_id)
+
+
+@shared_task
+def cleanup_old_tasks():
+    """Delete crawl tasks created more than 30 days ago.
+
+    For each matching task, its crawled content and logs are removed first,
+    then the task itself. Returns a summary message containing the number
+    of tasks that matched before deletion started.
+    """
+    from datetime import timedelta
+    from django.utils import timezone
+    from .models import CrawlLog, CrawlTask, CrawledContent
+
+    retention = timedelta(days=30)
+    threshold = timezone.now() - retention
+
+    # Tasks older than the retention window, counted before anything is deleted.
+    expired = CrawlTask.objects.filter(created_at__lt=threshold)
+    removed = expired.count()
+
+    for stale in expired:
+        # Remove dependent content and logs explicitly before the task row.
+        for dependent in (CrawledContent, CrawlLog):
+            dependent.objects.filter(task=stale).delete()
+        stale.delete()
+
+    return f"清理了 {removed} 个旧任务"
\ No newline at end of file
diff --git a/crawler/templates/crawler/base.html b/crawler/templates/crawler/base.html
new file mode 100644
index 0000000..7f5406e
--- /dev/null
+++ b/crawler/templates/crawler/base.html
@@ -0,0 +1,80 @@
+
+
+
+
+
+
+ 系统仪表板
+
+ {{ stats.total_websites }}
+ {{ stats.total_tasks }}
+ {{ stats.total_contents }}
+ {{ stats.active_tasks }}
+
+ {{ website_name }}
+ {{ contents|length }}
+
+
+ {% if content.is_local_saved %}
+
+ {{ content.title|truncatechars:60 }}
+
+ {% else %}
+
+ {{ content.title|truncatechars:60 }}
+
+ {% endif %}
+
+ {{ content.created_at|date:"m-d H:i" }}
+
+ 最近的任务
+
+ {{ task.name|truncatechars:30 }}
+
+ {{ task.get_status_display }}
+
+
+ 内容搜索
+
+
+ 搜索结果
+ {% if contents %}
+ {{ contents|length }} 条结果
+ {% endif %}
+
+
+ {% if content.is_local_saved %}
+
+ {{ content.title }}
+
+ {% else %}
+
+ {{ content.title }}
+
+ {% endif %}
+
+ {{ content.created_at|date:"Y-m-d H:i" }}
+ 开始搜索
+ 搜索建议:
+ 媒体文件
+"""
+ for media_file in media_files:
+ if media_file.media_type == 'image':
+ media_section += f"""
+ 图片: {media_file.alt_text or '无标题'}
+
+
文件: {media_file.get_media_type_display()}
+
+
标签
+ formatted_content = content.content.replace('\n\n', '
').replace('\n', '
')
+
+ # 动态生成预览页面
+ html_content = f"""
+
+
+
{formatted_content}
+