From e51154bb29bd9a3ff4868263aeb7379054386be4 Mon Sep 17 00:00:00 2001 From: yuangyaa Date: Tue, 23 Sep 2025 13:30:03 +0800 Subject: [PATCH] Base setup --- .gitignore | 2 +- crawler/__init__.py | 0 crawler/admin.py | 199 ++++++ crawler/apps.py | 6 + crawler/crawler_engine.py | 578 ++++++++++++++++++ crawler/management/__init__.py | 0 crawler/management/commands/__init__.py | 0 crawler/management/commands/init_websites.py | 36 ++ crawler/management/commands/run_crawler.py | 69 +++ crawler/migrations/0001_initial.py | 106 ++++ ..._crawledcontent_is_local_saved_and_more.py | 24 + crawler/migrations/0003_mediafile.py | 35 ++ crawler/migrations/__init__.py | 0 crawler/models.py | 195 ++++++ crawler/serializers.py | 51 ++ crawler/tasks.py | 36 ++ crawler/templates/crawler/base.html | 80 +++ crawler/templates/crawler/dashboard.html | 320 ++++++++++ crawler/templates/crawler/search.html | 128 ++++ crawler/templatetags/__init__.py | 0 crawler/templatetags/custom_filters.py | 32 + crawler/tests.py | 3 + crawler/urls.py | 8 + crawler/views.py | 292 +++++++++ crawler_project/__init__.py | 4 + crawler_project/asgi.py | 16 + crawler_project/celery.py | 17 + crawler_project/settings.py | 181 ++++++ crawler_project/urls.py | 29 + crawler_project/wsgi.py | 16 + manage.py | 22 + requirements.txt | 29 + start.sh | 49 ++ start_celery.sh | 12 + 34 files changed, 2574 insertions(+), 1 deletion(-) create mode 100644 crawler/__init__.py create mode 100644 crawler/admin.py create mode 100644 crawler/apps.py create mode 100644 crawler/crawler_engine.py create mode 100644 crawler/management/__init__.py create mode 100644 crawler/management/commands/__init__.py create mode 100644 crawler/management/commands/init_websites.py create mode 100644 crawler/management/commands/run_crawler.py create mode 100644 crawler/migrations/0001_initial.py create mode 100644 crawler/migrations/0002_crawledcontent_is_local_saved_and_more.py create mode 100644 crawler/migrations/0003_mediafile.py create mode 100644 crawler/migrations/__init__.py create mode 100644 crawler/models.py create mode 100644 crawler/serializers.py create mode 100644 crawler/tasks.py create mode 100644 crawler/templates/crawler/base.html create mode 100644 crawler/templates/crawler/dashboard.html create mode 100644 crawler/templates/crawler/search.html create mode 100644 crawler/templatetags/__init__.py create mode 100644 crawler/templatetags/custom_filters.py create mode 100644 crawler/tests.py create mode 100644 crawler/urls.py create mode 100644 crawler/views.py create mode 100644 crawler_project/__init__.py create mode 100644 crawler_project/asgi.py create mode 100644 crawler_project/celery.py create mode 100644 crawler_project/settings.py create mode 100644 crawler_project/urls.py create mode 100644 crawler_project/wsgi.py create mode 100755 manage.py create mode 100644 requirements.txt create mode 100755 start.sh create mode 100755 start_celery.sh diff --git a/.gitignore b/.gitignore index 36b13f1..575c1ad 100644 --- a/.gitignore +++ b/.gitignore @@ -166,7 +166,7 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ +.idea/ # Ruff stuff: .ruff_cache/ diff --git a/crawler/__init__.py b/crawler/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawler/admin.py b/crawler/admin.py new file mode 100644 index 0000000..b738c61 --- /dev/null +++ b/crawler/admin.py @@ -0,0 +1,199 @@ +from django.contrib import admin +from django.utils.html import format_html +from django.urls import reverse +from django.utils.safestring import mark_safe +from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile +from .tasks import crawl_websites_task + + +@admin.register(Website) +class WebsiteAdmin(admin.ModelAdmin): + list_display = ['name', 'region', 'url', 'is_active', 'created_at'] + list_filter = ['region', 'is_active', 'created_at'] + search_fields = ['name', 'url', 'region'] + list_editable = ['is_active'] + ordering = ['region', 'name'] + + +@admin.register(CrawlTask) +class CrawlTaskAdmin(admin.ModelAdmin): + list_display = ['name', 'status', 'created_by', 'progress_display', 'created_at', 'completed_at'] + list_filter = ['status', 'created_by', 'created_at'] + search_fields = ['name', 'keywords'] + readonly_fields = ['created_at', 'started_at', 'completed_at', 'progress_display'] + filter_horizontal = ['websites'] + actions = ['execute_crawl_task'] + + def progress_display(self, obj): + """显示任务进度""" + if obj.status == 'completed': + color = 'green' + elif obj.status == 'failed': + color = 'red' + elif obj.status == 'running': + color = 'orange' + else: + color = 'gray' + + return format_html( + '{}%', + color, + f'{obj.progress_percentage:.1f} ({obj.crawled_pages}/{obj.total_pages})' + ) + progress_display.short_description = '进度' + + def execute_crawl_task(self, request, queryset): + """执行选中的爬取任务""" + for task in queryset: + # 更新任务状态为pending + task.status = 'pending' + task.save() + + # 异步执行爬取任务 + crawl_websites_task.delay(task.id) + + self.message_user(request, f"已启动 {queryset.count()} 个爬取任务。") + execute_crawl_task.short_description = "执行选中的爬取任务" + + +@admin.register(CrawledContent) +class CrawledContentAdmin(admin.ModelAdmin): + list_display = ['title_short', 'website', 'task', 'keywords_matched', 'media_count', 'publish_date', 'is_local_saved', 'created_at'] + list_filter = ['website', 'task', 'created_at', 'publish_date', 'is_local_saved'] + search_fields = ['title', 'content', 'keywords_matched'] + readonly_fields = ['created_at', 'preview_content', 'media_files_display'] + ordering = ['-created_at'] + + def title_short(self, obj): + """显示缩短的标题""" + return obj.title[:50] + '...' if len(obj.title) > 50 else obj.title + title_short.short_description = '标题' + + def media_count(self, obj): + """显示媒体文件数量""" + count = obj.media_files.count() + if count > 0: + return format_html( + '{}', + count + ) + return "0" + media_count.short_description = '媒体文件' + + def preview_content(self, obj): + """预览内容""" + if obj.is_local_saved: + url = reverse('admin:crawled_content_preview', args=[obj.id]) + return format_html( + '预览文章', + url + ) + elif obj.content: + return format_html( + '
<div style="max-height: 200px; overflow-y: auto;">{}</div>',
+                obj.get_preview_content(500)
+            )
+        else:
+            return "无内容"
+    preview_content.short_description = '内容预览'
+
+    def media_files_display(self, obj):
+        """显示媒体文件列表"""
+        media_files = obj.media_files.all()
+        if not media_files:
+            return "无媒体文件"
+
+        html = "<div>"
+        for media_file in media_files:
+            if media_file.media_type == 'image':
+                html += format_html(
+                    '<div>'
+                    '图片: {}<br>'
+                    '<img src="/media/{}" style="max-width: 200px;"><br>'
+                    '大小: {}'
+                    '</div>',
+                    media_file.alt_text or '无标题',
+                    media_file.local_file.name,
+                    media_file.file_size_display
+                )
+            elif media_file.media_type == 'video':
+                html += format_html(
+                    '<div>'
+                    '视频:<br>'
+                    '<video controls style="max-width: 300px;">'
+                    '<source src="/media/{}" type="{}"></video><br>'
+                    '大小: {}'
+                    '</div>',
+                    media_file.local_file.name,
+                    media_file.mime_type,
+                    media_file.file_size_display
+                )
+            else:
+                html += format_html(
+                    '<div>'
+                    '{}: <a href="/media/{}" target="_blank">下载</a><br>'
+                    '大小: {}'
+                    '</div>',
+                    media_file.get_media_type_display(),
+                    media_file.local_file.name,
+                    media_file.file_size_display
+                )
+        html += "</div>
" + return format_html(html) + media_files_display.short_description = '媒体文件' + + +@admin.register(CrawlLog) +class CrawlLogAdmin(admin.ModelAdmin): + list_display = ['level', 'message_short', 'website', 'task', 'created_at'] + list_filter = ['level', 'website', 'task', 'created_at'] + search_fields = ['message'] + readonly_fields = ['created_at'] + ordering = ['-created_at'] + + def message_short(self, obj): + """显示缩短的消息""" + return obj.message[:100] + '...' if len(obj.message) > 100 else obj.message + message_short.short_description = '消息' + + +@admin.register(MediaFile) +class MediaFileAdmin(admin.ModelAdmin): + list_display = ['content', 'media_type', 'file_size_display', 'mime_type', 'created_at'] + list_filter = ['media_type', 'created_at'] + search_fields = ['content__title', 'original_url', 'alt_text'] + readonly_fields = ['created_at', 'file_size_display', 'media_preview'] + ordering = ['-created_at'] + + def media_preview(self, obj): + """媒体文件预览""" + if obj.media_type == 'image' and obj.local_file: + return format_html( + '', + obj.local_file.name + ) + elif obj.media_type == 'video' and obj.local_file: + return format_html( + '', + obj.local_file.name, + obj.mime_type + ) + elif obj.media_type == 'audio' and obj.local_file: + return format_html( + '', + obj.local_file.name, + obj.mime_type + ) + else: + return "无预览" + media_preview.short_description = '预览' + + +@admin.register(SearchKeyword) +class SearchKeywordAdmin(admin.ModelAdmin): + list_display = ['keyword', 'is_active', 'created_at', 'last_used'] + list_filter = ['is_active', 'created_at', 'last_used'] + search_fields = ['keyword'] + list_editable = ['is_active'] + ordering = ['-last_used', '-created_at'] \ No newline at end of file diff --git a/crawler/apps.py b/crawler/apps.py new file mode 100644 index 0000000..88478dd --- /dev/null +++ b/crawler/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class CrawlerConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'crawler' diff --git a/crawler/crawler_engine.py b/crawler/crawler_engine.py new file mode 100644 index 0000000..4763ad8 --- /dev/null +++ b/crawler/crawler_engine.py @@ -0,0 +1,578 @@ +import requests +import time +import re +import logging +import os +import urllib3 +from bs4 import BeautifulSoup +from urllib.parse import urljoin, urlparse +from django.conf import settings +from django.utils import timezone +from django.core.files.base import ContentFile +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry +from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile + +# 禁用SSL警告 +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +# 设置日志记录器 +logger = logging.getLogger(__name__) + + +class WebsiteCrawler: + """网站爬虫引擎""" + + def __init__(self, task_id): + self.task = CrawlTask.objects.get(id=task_id) + self.keywords = [kw.strip() for kw in self.task.keywords.split(',') if kw.strip()] + + # 创建带重试策略的会话 + self.session = requests.Session() + self.session.headers.update({ + 'User-Agent': settings.CRAWLER_SETTINGS['USER_AGENT'] + }) + + # 设置重试策略 + retry_strategy = Retry( + total=settings.CRAWLER_SETTINGS.get('MAX_RETRIES', 3), + backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504], + ) + adapter = HTTPAdapter(max_retries=retry_strategy) + self.session.mount("http://", adapter) + self.session.mount("https://", adapter) + + # 设置超时 + self.timeout = settings.CRAWLER_SETTINGS['TIMEOUT'] + + def log(self, level, message, website=None): + 
"""记录日志""" + CrawlLog.objects.create( + task=self.task, + website=website, + level=level, + message=message + ) + # 同时记录到Python日志系统 + logger.log(getattr(logging, level.upper()), f"Task {self.task.id}: {message}") + + def update_task_status(self, status, **kwargs): + """更新任务状态""" + self.task.status = status + if status == 'running' and not self.task.started_at: + self.task.started_at = timezone.now() + elif status in ['completed', 'failed', 'cancelled']: + self.task.completed_at = timezone.now() + + for key, value in kwargs.items(): + setattr(self.task, key, value) + self.task.save() + + def extract_text_content(self, soup): + """提取文本内容,保持段落结构""" + # 移除脚本和样式标签 + for script in soup(["script", "style"]): + script.decompose() + + # 处理段落标签,保持段落结构 + paragraphs = [] + + # 查找所有段落相关的标签 + for element in soup.find_all(['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br']): + if element.name in ['p', 'div']: + text = element.get_text().strip() + if text: + paragraphs.append(text) + elif element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + text = element.get_text().strip() + if text: + paragraphs.append(f"\n{text}\n") # 标题前后加换行 + elif element.name == 'br': + paragraphs.append('\n') + + # 如果没有找到段落标签,使用原来的方法 + if not paragraphs: + text = soup.get_text() + # 清理文本但保持换行 + lines = [] + for line in text.splitlines(): + line = line.strip() + if line: + lines.append(line) + return '\n\n'.join(lines) + + # 合并段落,用双换行分隔 + content = '\n\n'.join(paragraphs) + + # 清理多余的空行 + import re + content = re.sub(r'\n\s*\n\s*\n', '\n\n', content) + + return content.strip() + + def find_article_links(self, soup, base_url): + """查找文章链接""" + links = [] + + # 常见的文章链接选择器 + selectors = [ + 'a[href*="article"]', + 'a[href*="news"]', + 'a[href*="content"]', + 'a[href*="detail"]', + 'a[href*="view"]', + 'a[href*="show"]', + '.news-list a', + '.article-list a', + '.content-list a', + 'h3 a', + 'h4 a', + '.title a', + '.list-item a' + ] + + for selector in selectors: + elements = soup.select(selector) + for element in elements: + href = element.get('href') + if href: + full_url = urljoin(base_url, href) + title = element.get_text().strip() + if title and len(title) > 5: # 过滤掉太短的标题 + links.append({ + 'url': full_url, + 'title': title + }) + + return links + + def check_keyword_match(self, text, title): + """检查关键字匹配""" + matched_keywords = [] + text_lower = text.lower() + title_lower = title.lower() + + for keyword in self.keywords: + keyword_lower = keyword.lower() + if keyword_lower in text_lower or keyword_lower in title_lower: + matched_keywords.append(keyword) + + return matched_keywords + + def extract_article_content(self, url, soup): + """提取文章内容""" + # 尝试多种内容选择器 + content_selectors = [ + '.article-content', + '.content', + '.article-body', + '.news-content', + '.main-content', + '.post-content', + 'article', + '.detail-content', + '#content', + '.text' + ] + + content = "" + for selector in content_selectors: + element = soup.select_one(selector) + if element: + content = self.extract_text_content(element) + if len(content) > 100: # 确保内容足够长 + break + + # 如果没找到特定内容区域,使用整个页面 + if not content or len(content) < 100: + content = self.extract_text_content(soup) + + return content + + def extract_publish_date(self, soup): + """提取发布时间""" + date_selectors = [ + '.publish-time', + '.pub-time', + '.date', + '.time', + '.publish-date', + 'time[datetime]', + '.article-time', + '.news-time' + ] + + for selector in date_selectors: + element = soup.select_one(selector) + if element: + date_text = element.get_text().strip() + if 
element.get('datetime'): + date_text = element.get('datetime') + + # 尝试解析日期 + try: + from datetime import datetime + # 这里可以添加更复杂的日期解析逻辑 + # 暂时返回当前时间 + return timezone.now() + except: + continue + + return None + + def extract_author(self, soup): + """提取作者信息""" + author_selectors = [ + '.author', + '.writer', + '.publisher', + '.byline', + '.article-author', + '.news-author' + ] + + for selector in author_selectors: + element = soup.select_one(selector) + if element: + return element.get_text().strip() + + return "" + + def download_media_file(self, media_url, crawled_content, media_type='image', alt_text=''): + """下载媒体文件""" + try: + # 检查URL是否有效 + if not media_url or not media_url.startswith(('http://', 'https://')): + return None + + # 请求媒体文件 + response = self.session.get( + media_url, + timeout=self.timeout, + verify=False, + stream=False # 改为False以确保获取完整内容 + ) + response.raise_for_status() + + # 获取文件信息 + content_type = response.headers.get('content-type', '') + content_length = response.headers.get('content-length') + file_size = int(content_length) if content_length else len(response.content) + + # 确定文件扩展名 + file_extension = self.get_file_extension_from_url(media_url, content_type) + + # 生成文件名 + filename = f"media_{crawled_content.id}_{len(crawled_content.media_files.all())}{file_extension}" + + # 创建媒体文件对象 + media_file = MediaFile.objects.create( + content=crawled_content, + media_type=media_type, + original_url=media_url, + file_size=file_size, + mime_type=content_type, + alt_text=alt_text + ) + + # 保存文件 + media_file.local_file.save( + filename, + ContentFile(response.content), + save=True + ) + + self.log('info', f'媒体文件已下载: {filename} ({media_type})', crawled_content.website) + return media_file + + except Exception as e: + self.log('error', f'下载媒体文件失败 {media_url}: {str(e)}', crawled_content.website) + return None + + def get_file_extension_from_url(self, url, content_type): + """从URL或内容类型获取文件扩展名""" + # 从URL获取扩展名 + parsed_url = urlparse(url) + path = parsed_url.path + if '.' 
in path: + return os.path.splitext(path)[1] + + # 从内容类型获取扩展名 + content_type_map = { + 'image/jpeg': '.jpg', + 'image/jpg': '.jpg', + 'image/png': '.png', + 'image/gif': '.gif', + 'image/webp': '.webp', + 'image/svg+xml': '.svg', + 'video/mp4': '.mp4', + 'video/avi': '.avi', + 'video/mov': '.mov', + 'video/wmv': '.wmv', + 'video/flv': '.flv', + 'video/webm': '.webm', + 'audio/mp3': '.mp3', + 'audio/wav': '.wav', + 'audio/ogg': '.ogg', + 'application/pdf': '.pdf', + 'application/msword': '.doc', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx', + } + + return content_type_map.get(content_type.lower(), '.bin') + + def extract_and_download_media(self, soup, crawled_content, base_url): + """提取并下载页面中的媒体文件""" + media_files = [] + + # 提取图片 + images = soup.find_all('img') + self.log('info', f'找到 {len(images)} 个图片标签', crawled_content.website) + + for img in images: + src = img.get('src') + if src: + # 处理相对URL + if src.startswith('//'): + src = 'https:' + src + elif src.startswith('/'): + src = urljoin(base_url, src) + elif not src.startswith(('http://', 'https://')): + src = urljoin(base_url, src) + + alt_text = img.get('alt', '') + self.log('info', f'尝试下载图片: {src}', crawled_content.website) + media_file = self.download_media_file(src, crawled_content, 'image', alt_text) + if media_file: + media_files.append(media_file) + self.log('info', f'成功下载图片: {media_file.local_file.name}', crawled_content.website) + + # 提取视频 + videos = soup.find_all(['video', 'source']) + for video in videos: + src = video.get('src') + if src: + # 处理相对URL + if src.startswith('//'): + src = 'https:' + src + elif src.startswith('/'): + src = urljoin(base_url, src) + elif not src.startswith(('http://', 'https://')): + src = urljoin(base_url, src) + + media_file = self.download_media_file(src, crawled_content, 'video') + if media_file: + media_files.append(media_file) + + # 提取音频 + audios = soup.find_all('audio') + for audio in audios: + src = audio.get('src') + if src: + # 处理相对URL + if src.startswith('//'): + src = 'https:' + src + elif src.startswith('/'): + src = urljoin(base_url, src) + elif not src.startswith(('http://', 'https://')): + src = urljoin(base_url, src) + + media_file = self.download_media_file(src, crawled_content, 'audio') + if media_file: + media_files.append(media_file) + + return media_files + + def mark_content_saved(self, crawled_content): + """标记内容已保存(内容已存储在数据库中)""" + try: + crawled_content.is_local_saved = True + crawled_content.save() + + media_count = crawled_content.media_files.count() + self.log('info', f'文章内容已保存到数据库 (包含 {media_count} 个媒体文件)', crawled_content.website) + return True + except Exception as e: + self.log('error', f'标记内容保存状态失败: {str(e)}', crawled_content.website) + return False + + def crawl_website(self, website): + """爬取单个网站""" + self.log('info', f'开始爬取网站: {website.name}', website) + + try: + # 请求主页 + response = self.session.get( + website.url, + timeout=self.timeout, + verify=False # 忽略SSL证书验证 + ) + response.raise_for_status() + + # 检查内容编码 + if response.encoding != 'utf-8': + # 尝试从响应头获取编码 + content_type = response.headers.get('content-type', '') + if 'charset=' in content_type: + charset = content_type.split('charset=')[-1] + response.encoding = charset + else: + response.encoding = 'utf-8' + + soup = BeautifulSoup(response.content, 'html.parser') + + # 查找文章链接 + article_links = self.find_article_links(soup, website.url) + self.log('info', f'找到 {len(article_links)} 个文章链接', website) + + crawled_count = 0 + for link_info in article_links: + try: + # 
请求文章页面 + article_response = self.session.get( + link_info['url'], + timeout=self.timeout, + verify=False # 忽略SSL证书验证 + ) + article_response.raise_for_status() + + # 检查内容编码 + if article_response.encoding != 'utf-8': + # 尝试从响应头获取编码 + content_type = article_response.headers.get('content-type', '') + if 'charset=' in content_type: + charset = content_type.split('charset=')[-1] + article_response.encoding = charset + else: + article_response.encoding = 'utf-8' + + article_soup = BeautifulSoup(article_response.content, 'html.parser') + + # 提取内容 + content = self.extract_article_content(link_info['url'], article_soup) + title = link_info['title'] + + # 检查关键字匹配 + matched_keywords = self.check_keyword_match(content, title) + + if matched_keywords: + # 提取其他信息 + publish_date = self.extract_publish_date(article_soup) + author = self.extract_author(article_soup) + + # 保存内容 + crawled_content = CrawledContent.objects.create( + task=self.task, + website=website, + title=title, + content=content, + url=link_info['url'], + publish_date=publish_date, + author=author, + keywords_matched=','.join(matched_keywords), + is_local_saved=False # 初始设置为False,保存到本地后会更新为True + ) + + # 提取并下载媒体文件 + media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url']) + + # 标记内容已保存 + self.mark_content_saved(crawled_content) + + crawled_count += 1 + self.log('info', f'保存文章: {title[:50]}...', website) + + # 请求间隔 + time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY']) + + except requests.exceptions.SSLError as e: + self.log('error', f'SSL错误,跳过文章 {link_info["url"]}: {str(e)}', website) + continue + except requests.exceptions.ConnectionError as e: + self.log('error', f'连接错误,跳过文章 {link_info["url"]}: {str(e)}', website) + continue + except requests.exceptions.Timeout as e: + self.log('error', f'请求超时,跳过文章 {link_info["url"]}: {str(e)}', website) + continue + except requests.exceptions.RequestException as e: + self.log('error', f'网络请求错误,跳过文章 {link_info["url"]}: {str(e)}', website) + continue + except UnicodeDecodeError as e: + self.log('error', f'字符编码错误,跳过文章 {link_info["url"]}: {str(e)}', website) + continue + except Exception as e: + self.log('error', f'处理文章失败 {link_info["url"]}: {str(e)}', website) + continue + + self.log('info', f'网站爬取完成,共保存 {crawled_count} 篇文章', website) + return crawled_count + + except requests.exceptions.SSLError as e: + self.log('error', f'爬取网站SSL错误: {str(e)}', website) + return 0 + except requests.exceptions.ConnectionError as e: + self.log('error', f'爬取网站连接错误: {str(e)}', website) + return 0 + except requests.exceptions.Timeout as e: + self.log('error', f'爬取网站超时: {str(e)}', website) + return 0 + except requests.exceptions.RequestException as e: + self.log('error', f'爬取网站网络错误: {str(e)}', website) + return 0 + except Exception as e: + self.log('error', f'爬取网站失败: {str(e)}', website) + return 0 + + def run(self): + """运行爬取任务""" + self.log('info', f'开始执行爬取任务: {self.task.name}') + self.update_task_status('running') + + total_crawled = 0 + websites = self.task.websites.filter(is_active=True) + self.task.total_pages = websites.count() + self.task.save() + + for website in websites: + try: + crawled_count = self.crawl_website(website) + total_crawled += crawled_count + self.task.crawled_pages += 1 + self.task.save() + + except Exception as e: + self.log('error', f'爬取网站 {website.name} 时发生错误: {str(e)}', website) + continue + + # 更新任务状态 + if total_crawled > 0: + self.update_task_status('completed') + self.log('info', f'爬取任务完成,共爬取 {total_crawled} 篇文章') + else: + self.update_task_status('failed', 
error_message='没有找到匹配的内容') + self.log('error', '爬取任务失败,没有找到匹配的内容') + + +def run_crawl_task(task_id): + """运行爬取任务(Celery任务)""" + try: + crawler = WebsiteCrawler(task_id) + crawler.run() + return f"任务 {task_id} 执行完成" + except Exception as e: + # 记录异常到日志 + logger.error(f"执行任务 {task_id} 时发生异常: {str(e)}", exc_info=True) + + task = CrawlTask.objects.get(id=task_id) + task.status = 'failed' + task.error_message = str(e) + task.completed_at = timezone.now() + task.save() + + CrawlLog.objects.create( + task=task, + level='error', + message=f'任务执行失败: {str(e)}' + ) + return f"任务 {task_id} 执行失败: {str(e)}" \ No newline at end of file diff --git a/crawler/management/__init__.py b/crawler/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawler/management/commands/__init__.py b/crawler/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawler/management/commands/init_websites.py b/crawler/management/commands/init_websites.py new file mode 100644 index 0000000..d0e01e1 --- /dev/null +++ b/crawler/management/commands/init_websites.py @@ -0,0 +1,36 @@ +from django.core.management.base import BaseCommand +from django.conf import settings +from crawler.models import Website + + +class Command(BaseCommand): + help = '初始化目标网站数据' + + def handle(self, *args, **options): + self.stdout.write('开始初始化目标网站数据...') + + # 清空现有数据 + Website.objects.all().delete() + + # 从设置中获取网站列表 + websites_data = settings.TARGET_WEBSITES + + created_count = 0 + for website_data in websites_data: + website, created = Website.objects.get_or_create( + url=website_data['url'], + defaults={ + 'name': website_data['name'], + 'region': website_data['region'], + 'is_active': True + } + ) + if created: + created_count += 1 + self.stdout.write(f'创建网站: {website.name}') + else: + self.stdout.write(f'网站已存在: {website.name}') + + self.stdout.write( + self.style.SUCCESS(f'成功初始化 {created_count} 个网站') + ) diff --git a/crawler/management/commands/run_crawler.py b/crawler/management/commands/run_crawler.py new file mode 100644 index 0000000..efeaa54 --- /dev/null +++ b/crawler/management/commands/run_crawler.py @@ -0,0 +1,69 @@ +from django.core.management.base import BaseCommand +from crawler.models import CrawlTask, Website +from crawler.tasks import crawl_websites_task + + +class Command(BaseCommand): + help = '运行爬虫任务' + + def add_arguments(self, parser): + parser.add_argument( + '--keywords', + type=str, + required=True, + help='搜索关键字,多个关键字用逗号分隔' + ) + parser.add_argument( + '--websites', + type=str, + help='网站ID列表,用逗号分隔。不指定则爬取所有网站' + ) + parser.add_argument( + '--name', + type=str, + help='任务名称' + ) + + def handle(self, *args, **options): + keywords = options['keywords'] + website_ids = options.get('websites') + task_name = options.get('name', f'关键字搜索: {keywords}') + + # 获取目标网站 + if website_ids: + website_id_list = [int(id.strip()) for id in website_ids.split(',')] + websites = Website.objects.filter(id__in=website_id_list, is_active=True) + else: + websites = Website.objects.filter(is_active=True) + + if not websites.exists(): + self.stdout.write( + self.style.ERROR('没有找到可用的网站') + ) + return + + # 创建任务 + task = CrawlTask.objects.create( + name=task_name, + keywords=keywords, + created_by='management_command' + ) + task.websites.set(websites) + + self.stdout.write(f'创建任务: {task.name}') + self.stdout.write(f'目标网站: {websites.count()} 个') + self.stdout.write(f'搜索关键字: {keywords}') + + # 启动任务(同步模式,如果没有Redis则直接运行) + try: + crawl_websites_task.delay(task.id) + self.stdout.write('任务已提交到队列') + 
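
Note: the run_crawler management command defined here can also be driven programmatically. A minimal sketch, assuming it runs where the project's Django settings are loaded (for example inside python manage.py shell); the keyword and task-name values are illustrative only:

from django.core.management import call_command

# Forwards the same --keywords / --name options that add_arguments() registers above.
call_command('run_crawler', keywords='反腐败,监督', name='示例任务')
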
except Exception as e: + self.stdout.write(f'队列不可用,直接运行任务: {e}') + from crawler.crawler_engine import WebsiteCrawler + crawler = WebsiteCrawler(task.id) + crawler.run() + + self.stdout.write( + self.style.SUCCESS(f'任务已启动,任务ID: {task.id}') + ) diff --git a/crawler/migrations/0001_initial.py b/crawler/migrations/0001_initial.py new file mode 100644 index 0000000..eba286f --- /dev/null +++ b/crawler/migrations/0001_initial.py @@ -0,0 +1,106 @@ +# Generated by Django 5.2.6 on 2025-09-22 16:27 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='SearchKeyword', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('keyword', models.CharField(max_length=100, unique=True, verbose_name='关键字')), + ('is_active', models.BooleanField(default=True, verbose_name='是否启用')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')), + ('last_used', models.DateTimeField(blank=True, null=True, verbose_name='最后使用时间')), + ], + options={ + 'verbose_name': '搜索关键字', + 'verbose_name_plural': '搜索关键字', + 'ordering': ['-last_used', '-created_at'], + }, + ), + migrations.CreateModel( + name='Website', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=100, verbose_name='网站名称')), + ('url', models.URLField(verbose_name='网站地址')), + ('region', models.CharField(max_length=50, verbose_name='所属地区')), + ('is_active', models.BooleanField(default=True, verbose_name='是否启用')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')), + ('updated_at', models.DateTimeField(auto_now=True, verbose_name='更新时间')), + ], + options={ + 'verbose_name': '目标网站', + 'verbose_name_plural': '目标网站', + 'ordering': ['region', 'name'], + }, + ), + migrations.CreateModel( + name='CrawlTask', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('name', models.CharField(max_length=200, verbose_name='任务名称')), + ('keywords', models.TextField(help_text='多个关键字用逗号分隔', verbose_name='搜索关键字')), + ('status', models.CharField(choices=[('pending', '待执行'), ('running', '执行中'), ('completed', '已完成'), ('failed', '执行失败'), ('cancelled', '已取消')], default='pending', max_length=20, verbose_name='任务状态')), + ('created_by', models.CharField(default='system', max_length=100, verbose_name='创建者')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')), + ('started_at', models.DateTimeField(blank=True, null=True, verbose_name='开始时间')), + ('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='完成时间')), + ('error_message', models.TextField(blank=True, verbose_name='错误信息')), + ('total_pages', models.IntegerField(default=0, verbose_name='总页数')), + ('crawled_pages', models.IntegerField(default=0, verbose_name='已爬取页数')), + ('websites', models.ManyToManyField(to='crawler.website', verbose_name='目标网站')), + ], + options={ + 'verbose_name': '爬取任务', + 'verbose_name_plural': '爬取任务', + 'ordering': ['-created_at'], + }, + ), + migrations.CreateModel( + name='CrawlLog', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('level', models.CharField(choices=[('info', '信息'), ('warning', '警告'), ('error', '错误'), ('debug', '调试')], max_length=20, 
verbose_name='日志级别')), + ('message', models.TextField(verbose_name='日志消息')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='记录时间')), + ('task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='logs', to='crawler.crawltask', verbose_name='所属任务')), + ('website', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='crawler.website', verbose_name='相关网站')), + ], + options={ + 'verbose_name': '爬取日志', + 'verbose_name_plural': '爬取日志', + 'ordering': ['-created_at'], + }, + ), + migrations.CreateModel( + name='CrawledContent', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('title', models.CharField(max_length=500, verbose_name='标题')), + ('content', models.TextField(verbose_name='内容')), + ('url', models.URLField(verbose_name='原文链接')), + ('publish_date', models.DateTimeField(blank=True, null=True, verbose_name='发布时间')), + ('author', models.CharField(blank=True, max_length=100, verbose_name='作者')), + ('keywords_matched', models.TextField(help_text='匹配到的关键字,用逗号分隔', verbose_name='匹配的关键字')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='爬取时间')), + ('task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='contents', to='crawler.crawltask', verbose_name='所属任务')), + ('website', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawler.website', verbose_name='来源网站')), + ], + options={ + 'verbose_name': '爬取内容', + 'verbose_name_plural': '爬取内容', + 'ordering': ['-created_at'], + 'indexes': [models.Index(fields=['task', 'website'], name='crawler_cra_task_id_6244e7_idx'), models.Index(fields=['created_at'], name='crawler_cra_created_a116d2_idx'), models.Index(fields=['publish_date'], name='crawler_cra_publish_5b8ccc_idx')], + }, + ), + ] diff --git a/crawler/migrations/0002_crawledcontent_is_local_saved_and_more.py b/crawler/migrations/0002_crawledcontent_is_local_saved_and_more.py new file mode 100644 index 0000000..dbe2d0a --- /dev/null +++ b/crawler/migrations/0002_crawledcontent_is_local_saved_and_more.py @@ -0,0 +1,24 @@ +# Generated by Django 5.2.6 on 2025-09-23 00:38 + +import crawler.models +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('crawler', '0001_initial'), + ] + + operations = [ + migrations.AddField( + model_name='crawledcontent', + name='is_local_saved', + field=models.BooleanField(default=False, verbose_name='是否已本地保存'), + ), + migrations.AddField( + model_name='crawledcontent', + name='local_file', + field=models.FileField(blank=True, null=True, upload_to=crawler.models.crawled_content_file_path, verbose_name='本地文件'), + ), + ] diff --git a/crawler/migrations/0003_mediafile.py b/crawler/migrations/0003_mediafile.py new file mode 100644 index 0000000..323b497 --- /dev/null +++ b/crawler/migrations/0003_mediafile.py @@ -0,0 +1,35 @@ +# Generated by Django 5.2.6 on 2025-09-23 01:05 + +import crawler.models +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('crawler', '0002_crawledcontent_is_local_saved_and_more'), + ] + + operations = [ + migrations.CreateModel( + name='MediaFile', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('media_type', models.CharField(choices=[('image', '图片'), ('video', '视频'), ('audio', '音频'), ('document', '文档')], max_length=20, 
verbose_name='媒体类型')), + ('original_url', models.URLField(verbose_name='原始URL')), + ('local_file', models.FileField(upload_to=crawler.models.media_file_path, verbose_name='本地文件')), + ('file_size', models.BigIntegerField(blank=True, null=True, verbose_name='文件大小(字节)')), + ('mime_type', models.CharField(blank=True, max_length=100, verbose_name='MIME类型')), + ('alt_text', models.CharField(blank=True, max_length=500, verbose_name='替代文本')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')), + ('content', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='media_files', to='crawler.crawledcontent', verbose_name='所属内容')), + ], + options={ + 'verbose_name': '媒体文件', + 'verbose_name_plural': '媒体文件', + 'ordering': ['-created_at'], + 'indexes': [models.Index(fields=['content', 'media_type'], name='crawler_med_content_3a9468_idx'), models.Index(fields=['created_at'], name='crawler_med_created_13ff00_idx')], + }, + ), + ] diff --git a/crawler/migrations/__init__.py b/crawler/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawler/models.py b/crawler/models.py new file mode 100644 index 0000000..bfa6ebb --- /dev/null +++ b/crawler/models.py @@ -0,0 +1,195 @@ +from django.db import models +from django.utils import timezone +import os + + +def crawled_content_file_path(instance, filename): + """生成爬取内容文件的存储路径""" + # 使用任务ID和时间戳创建唯一文件名 + timestamp = timezone.now().strftime('%Y%m%d_%H%M%S') + name, ext = os.path.splitext(filename) + return f'crawled_content/{instance.task.id}/{timestamp}_{instance.id}{ext}' + + +def media_file_path(instance, filename): + """生成媒体文件的存储路径""" + # 使用任务ID和内容ID创建媒体文件路径 + timestamp = timezone.now().strftime('%Y%m%d_%H%M%S') + name, ext = os.path.splitext(filename) + return f'{instance.content.task.id}/{instance.content.id}/{timestamp}_{name}{ext}' + + +class Website(models.Model): + """目标网站模型""" + name = models.CharField(max_length=100, verbose_name='网站名称') + url = models.URLField(verbose_name='网站地址') + region = models.CharField(max_length=50, verbose_name='所属地区') + is_active = models.BooleanField(default=True, verbose_name='是否启用') + created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') + updated_at = models.DateTimeField(auto_now=True, verbose_name='更新时间') + + class Meta: + verbose_name = '目标网站' + verbose_name_plural = '目标网站' + ordering = ['region', 'name'] + + def __str__(self): + return f"{self.region} - {self.name}" + + +class CrawlTask(models.Model): + """爬取任务模型""" + TASK_STATUS_CHOICES = [ + ('pending', '待执行'), + ('running', '执行中'), + ('completed', '已完成'), + ('failed', '执行失败'), + ('cancelled', '已取消'), + ] + + name = models.CharField(max_length=200, verbose_name='任务名称') + keywords = models.TextField(verbose_name='搜索关键字', help_text='多个关键字用逗号分隔') + websites = models.ManyToManyField(Website, verbose_name='目标网站') + status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name='任务状态') + created_by = models.CharField(max_length=100, verbose_name='创建者', default='system') + created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') + started_at = models.DateTimeField(null=True, blank=True, verbose_name='开始时间') + completed_at = models.DateTimeField(null=True, blank=True, verbose_name='完成时间') + error_message = models.TextField(blank=True, verbose_name='错误信息') + total_pages = models.IntegerField(default=0, verbose_name='总页数') + crawled_pages = models.IntegerField(default=0, verbose_name='已爬取页数') + + class Meta: + verbose_name = 
'爬取任务' + verbose_name_plural = '爬取任务' + ordering = ['-created_at'] + + def __str__(self): + return f"{self.name} - {self.get_status_display()}" + + @property + def progress_percentage(self): + """计算任务进度百分比""" + if self.total_pages == 0: + return 0 + return round((self.crawled_pages / self.total_pages) * 100, 2) + + +class CrawledContent(models.Model): + """爬取内容模型""" + task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='contents', verbose_name='所属任务') + website = models.ForeignKey(Website, on_delete=models.CASCADE, verbose_name='来源网站') + title = models.CharField(max_length=500, verbose_name='标题') + content = models.TextField(verbose_name='内容') + url = models.URLField(verbose_name='原文链接') + publish_date = models.DateTimeField(null=True, blank=True, verbose_name='发布时间') + author = models.CharField(max_length=100, blank=True, verbose_name='作者') + keywords_matched = models.TextField(verbose_name='匹配的关键字', help_text='匹配到的关键字,用逗号分隔') + created_at = models.DateTimeField(auto_now_add=True, verbose_name='爬取时间') + + # 添加本地存储字段 + local_file = models.FileField(upload_to=crawled_content_file_path, blank=True, null=True, verbose_name='本地文件') + is_local_saved = models.BooleanField(default=False, verbose_name='是否已本地保存') + + class Meta: + verbose_name = '爬取内容' + verbose_name_plural = '爬取内容' + ordering = ['-created_at'] + indexes = [ + models.Index(fields=['task', 'website']), + models.Index(fields=['created_at']), + models.Index(fields=['publish_date']), + ] + + def __str__(self): + return f"{self.website.name} - {self.title[:50]}" + + def get_preview_content(self, max_length=500): + """获取预览内容""" + if len(self.content) <= max_length: + return self.content + return self.content[:max_length] + '...' + + +class MediaFile(models.Model): + """媒体文件模型""" + MEDIA_TYPE_CHOICES = [ + ('image', '图片'), + ('video', '视频'), + ('audio', '音频'), + ('document', '文档'), + ] + + content = models.ForeignKey(CrawledContent, on_delete=models.CASCADE, related_name='media_files', verbose_name='所属内容') + media_type = models.CharField(max_length=20, choices=MEDIA_TYPE_CHOICES, verbose_name='媒体类型') + original_url = models.URLField(verbose_name='原始URL') + local_file = models.FileField(upload_to=media_file_path, verbose_name='本地文件') + file_size = models.BigIntegerField(null=True, blank=True, verbose_name='文件大小(字节)') + mime_type = models.CharField(max_length=100, blank=True, verbose_name='MIME类型') + alt_text = models.CharField(max_length=500, blank=True, verbose_name='替代文本') + created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') + + class Meta: + verbose_name = '媒体文件' + verbose_name_plural = '媒体文件' + ordering = ['-created_at'] + indexes = [ + models.Index(fields=['content', 'media_type']), + models.Index(fields=['created_at']), + ] + + def __str__(self): + return f"{self.get_media_type_display()} - {self.original_url}" + + @property + def file_size_display(self): + """显示文件大小""" + if not self.file_size: + return "未知" + + size = self.file_size + for unit in ['B', 'KB', 'MB', 'GB']: + if size < 1024.0: + return f"{size:.1f} {unit}" + size /= 1024.0 + return f"{size:.1f} TB" + + +class CrawlLog(models.Model): + """爬取日志模型""" + LOG_LEVEL_CHOICES = [ + ('info', '信息'), + ('warning', '警告'), + ('error', '错误'), + ('debug', '调试'), + ] + + task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='logs', verbose_name='所属任务') + website = models.ForeignKey(Website, on_delete=models.CASCADE, null=True, blank=True, verbose_name='相关网站') + level = models.CharField(max_length=20, 
choices=LOG_LEVEL_CHOICES, verbose_name='日志级别') + message = models.TextField(verbose_name='日志消息') + created_at = models.DateTimeField(auto_now_add=True, verbose_name='记录时间') + + class Meta: + verbose_name = '爬取日志' + verbose_name_plural = '爬取日志' + ordering = ['-created_at'] + + def __str__(self): + return f"[{self.get_level_display()}] {self.message[:100]}" + + +class SearchKeyword(models.Model): + """搜索关键字模型""" + keyword = models.CharField(max_length=100, unique=True, verbose_name='关键字') + is_active = models.BooleanField(default=True, verbose_name='是否启用') + created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间') + last_used = models.DateTimeField(null=True, blank=True, verbose_name='最后使用时间') + + class Meta: + verbose_name = '搜索关键字' + verbose_name_plural = '搜索关键字' + ordering = ['-last_used', '-created_at'] + + def __str__(self): + return self.keyword \ No newline at end of file diff --git a/crawler/serializers.py b/crawler/serializers.py new file mode 100644 index 0000000..54f9841 --- /dev/null +++ b/crawler/serializers.py @@ -0,0 +1,51 @@ +from rest_framework import serializers +from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile + + +class WebsiteSerializer(serializers.ModelSerializer): + class Meta: + model = Website + fields = '__all__' + + +class CrawlTaskSerializer(serializers.ModelSerializer): + websites = WebsiteSerializer(many=True, read_only=True) + progress_percentage = serializers.ReadOnlyField() + + class Meta: + model = CrawlTask + fields = '__all__' + + +class MediaFileSerializer(serializers.ModelSerializer): + file_size_display = serializers.ReadOnlyField() + + class Meta: + model = MediaFile + fields = '__all__' + + +class CrawledContentSerializer(serializers.ModelSerializer): + website_name = serializers.CharField(source='website.name', read_only=True) + website_region = serializers.CharField(source='website.region', read_only=True) + task_name = serializers.CharField(source='task.name', read_only=True) + media_files = MediaFileSerializer(many=True, read_only=True) + + class Meta: + model = CrawledContent + fields = '__all__' + + +class CrawlLogSerializer(serializers.ModelSerializer): + website_name = serializers.CharField(source='website.name', read_only=True) + task_name = serializers.CharField(source='task.name', read_only=True) + + class Meta: + model = CrawlLog + fields = '__all__' + + +class SearchKeywordSerializer(serializers.ModelSerializer): + class Meta: + model = SearchKeyword + fields = '__all__' diff --git a/crawler/tasks.py b/crawler/tasks.py new file mode 100644 index 0000000..c860eaa --- /dev/null +++ b/crawler/tasks.py @@ -0,0 +1,36 @@ +from celery import shared_task +from .crawler_engine import run_crawl_task as execute_crawl_task + + +@shared_task +def crawl_websites_task(task_id): + """爬取网站的Celery任务""" + return execute_crawl_task(task_id) + + +@shared_task +def run_crawl_task(task_id): + """执行爬取任务的Celery任务(为管理界面提供)""" + return execute_crawl_task(task_id) + + +@shared_task +def cleanup_old_tasks(): + """清理旧任务(保留最近30天的任务)""" + from django.utils import timezone + from datetime import timedelta + from .models import CrawlTask, CrawlLog, CrawledContent + + cutoff_date = timezone.now() - timedelta(days=30) + + # 删除30天前的任务及其相关数据 + old_tasks = CrawlTask.objects.filter(created_at__lt=cutoff_date) + count = old_tasks.count() + + for task in old_tasks: + # 删除相关的内容和日志 + CrawledContent.objects.filter(task=task).delete() + CrawlLog.objects.filter(task=task).delete() + task.delete() + + return f"清理了 {count} 
个旧任务" \ No newline at end of file diff --git a/crawler/templates/crawler/base.html b/crawler/templates/crawler/base.html new file mode 100644 index 0000000..7f5406e --- /dev/null +++ b/crawler/templates/crawler/base.html @@ -0,0 +1,80 @@ + + + + + + {% block title %}网站爬虫系统{% endblock %} + + + + + + + +
+ {% block content %} + {% endblock %} +
+ + + + + {% block extra_js %} + {% endblock %} + + diff --git a/crawler/templates/crawler/dashboard.html b/crawler/templates/crawler/dashboard.html new file mode 100644 index 0000000..348059c --- /dev/null +++ b/crawler/templates/crawler/dashboard.html @@ -0,0 +1,320 @@ +{% extends 'crawler/base.html' %} +{% load custom_filters %} + +{% block title %}仪表板 - 网站爬虫系统{% endblock %} + +{% block content %} +
+
+

+ 系统仪表板 +

+
+
+ + +
+
+
+
+
+
+

{{ stats.total_websites }}

+

监控网站

+
+
+ +
+
+
+
+
+ +
+
+
+
+
+

{{ stats.total_tasks }}

+

爬取任务

+
+
+ +
+
+
+
+
+ +
+
+
+
+
+

{{ stats.total_contents }}

+

爬取内容

+
+
+ +
+
+
+
+
+ +
+
+
+
+
+

{{ stats.active_tasks }}

+

运行中任务

+
+
+ +
+
+
+
+
+
+ +
+ +
+
+
+
+
+
+ + + + + +
+
+
+ + {% if stats.page_obj.has_other_pages %} + + {% endif %} +
+
+
+
+
+ + +
+ {% for website_name, contents in stats.contents_by_website.items %} +
+
+
+ {{ website_name }} + {{ contents|length }} +
+
+
+
+ {% for content in contents %} +
+
+
+ {% if content.is_local_saved %} + + {{ content.title|truncatechars:60 }} + + {% else %} + + {{ content.title|truncatechars:60 }} + + {% endif %} +
+ {{ content.created_at|date:"m-d H:i" }} +
+

{{ content.content|truncatechars:100 }}

+
+ + {{ content.website.region }} + {% if content.media_files.count > 0 %} + | {{ content.media_files.count }} 个媒体文件 + {% endif %} + +
+ {% for keyword in content.keywords_matched|split:"," %} + {{ keyword|strip }} + {% endfor %} +
+
+
+ {% endfor %} +
+
+
+ {% empty %} +
+
+

暂无爬取内容

+
+
+ {% endfor %} + + + {% if stats.page_obj.has_other_pages %} +
+
+
+
+ 显示第 {{ stats.page_obj.start_index }} 到 {{ stats.page_obj.end_index }} 条,共 {{ stats.page_obj.paginator.count }} 条记录 +
+
+ + +
+
+
+
+ {% endif %} +
+ + +
+
+
+
+ 最近的任务 +
+
+
+ {% if stats.recent_tasks %} +
+ {% for task in stats.recent_tasks %} +
+
+
{{ task.name|truncatechars:30 }}
+ + {{ task.get_status_display }} + +
+

+ 关键字: {{ task.keywords|truncatechars:40 }} +

+ {{ task.created_at|date:"m-d H:i" }} +
+ {% endfor %} +
+ {% else %} +

暂无任务

+ {% endif %} +
+
+
+
+ + +
+
+
+
+
+ 快速操作 +
+
+ +
+
+
+{% endblock %} \ No newline at end of file diff --git a/crawler/templates/crawler/search.html b/crawler/templates/crawler/search.html new file mode 100644 index 0000000..ff74e05 --- /dev/null +++ b/crawler/templates/crawler/search.html @@ -0,0 +1,128 @@ +{% extends 'crawler/base.html' %} +{% load custom_filters %} + +{% block title %}搜索内容 - 网站爬虫系统{% endblock %} + +{% block content %} +
+
+

+ 内容搜索 +

+
+
+ + +
+
+
+
+
+
+ + +
+
+
+
+
+
+ + +{% if keyword %} +
+
+
+
+
+ 搜索结果 + {% if contents %} + {{ contents|length }} 条结果 + {% endif %} +
+
+
+ {% if contents %} +
+ {% for content in contents %} +
+
+
+ {% if content.is_local_saved %} + + {{ content.title }} + + {% else %} + + {{ content.title }} + + {% endif %} +
+ {{ content.created_at|date:"Y-m-d H:i" }} +
+

{{ content.content|truncatechars:200 }}

+
+ + {{ content.website.region }} - {{ content.website.name }} + {% if content.author %} + | {{ content.author }} + {% endif %} + {% if content.publish_date %} + | {{ content.publish_date|date:"Y-m-d" }} + {% endif %} + {% if content.media_files.count > 0 %} + | {{ content.media_files.count }} 个媒体文件 + {% endif %} + +
+ {% for keyword in content.keywords_matched|split:"," %} + {{ keyword|strip }} + {% endfor %} +
+
+
+ {% endfor %} +
+ {% else %} +
+ +

没有找到包含 "{{ keyword }}" 的内容

+

请尝试其他关键字或检查爬取任务是否正常运行

+
+ {% endif %} +
+
+
+
+{% else %} + +
+
+
+
+ +

开始搜索

+

在上方输入框中输入关键字,搜索已爬取的内容

+
+
搜索建议:
+
+ 反腐败 + 纪律检查 + 监督 + 廉政 + 违纪 +
+
+
+
+
+
+{% endif %} +{% endblock %} \ No newline at end of file diff --git a/crawler/templatetags/__init__.py b/crawler/templatetags/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/crawler/templatetags/custom_filters.py b/crawler/templatetags/custom_filters.py new file mode 100644 index 0000000..9313904 --- /dev/null +++ b/crawler/templatetags/custom_filters.py @@ -0,0 +1,32 @@ +from django import template + +register = template.Library() + + +@register.filter +def split(value, separator=','): + """Split a string by separator""" + if not value: + return [] + return value.split(separator) + + +@register.filter +def strip(value): + """Strip whitespace from a string""" + if not value: + return '' + return value.strip() + + +@register.filter +def div(value, divisor): + """Divide value by divisor""" + try: + value = float(value) + divisor = float(divisor) + if divisor == 0: + return 0 + return value / divisor + except (ValueError, TypeError): + return 0 \ No newline at end of file diff --git a/crawler/tests.py b/crawler/tests.py new file mode 100644 index 0000000..7ce503c --- /dev/null +++ b/crawler/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/crawler/urls.py b/crawler/urls.py new file mode 100644 index 0000000..0c34da0 --- /dev/null +++ b/crawler/urls.py @@ -0,0 +1,8 @@ +from django.urls import path, include +from . import views + +urlpatterns = [ + path('', views.dashboard, name='dashboard'), + path('search/', views.search_page, name='search'), + path('crawled-content//preview/', views.preview_crawled_content, name='preview_crawled_content'), +] \ No newline at end of file diff --git a/crawler/views.py b/crawler/views.py new file mode 100644 index 0000000..70312f3 --- /dev/null +++ b/crawler/views.py @@ -0,0 +1,292 @@ +from django.shortcuts import render, get_object_or_404 +from django.http import HttpResponse +from django.db.models import Q, Count +from django.conf import settings +from django.utils import timezone +from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword +from rest_framework import viewsets, filters +from rest_framework.decorators import action +from rest_framework.response import Response +from .serializers import ( + WebsiteSerializer, CrawlTaskSerializer, CrawledContentSerializer, + CrawlLogSerializer, SearchKeywordSerializer +) +import json +from django.core.paginator import Paginator +from django.db.models.functions import TruncDate +from django.db.models import Count + + +def dashboard(request): + """仪表板视图""" + # 统计数据 + total_websites = Website.objects.filter(is_active=True).count() + total_tasks = CrawlTask.objects.count() + total_contents = CrawledContent.objects.count() + active_tasks = CrawlTask.objects.filter(status='running').count() + + # 获取所有网站 + websites = Website.objects.filter(is_active=True).order_by('name') + + # 获取当前选中的网站ID + selected_website_id = request.GET.get('website') + + # 获取分页参数 + page_number = request.GET.get('page', 1) + page_size = request.GET.get('page_size', 20) # 默认每页20篇文章 + + # 尝试转换page_size为整数 + try: + page_size = int(page_size) + # 限制page_size在合理范围内 + page_size = max(10, min(100, page_size)) + except (ValueError, TypeError): + page_size = 20 + + # 获取所有爬取的内容,按网站和创建时间排序 + all_contents = CrawledContent.objects.select_related('website').order_by('website__name', '-created_at') + + # 如果选择了特定网站,则进行过滤 + if selected_website_id: + try: + selected_website_id = int(selected_website_id) + all_contents = all_contents.filter(website_id=selected_website_id) + 
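
Note: the filters registered in crawler/templatetags/custom_filters.py above are plain Python functions, so their edge cases can be sanity-checked outside the template layer. A minimal sketch, assuming Django is importable; the sample values are illustrative:

from crawler.templatetags.custom_filters import split, strip, div

assert split('反腐败,监督') == ['反腐败', '监督']  # default separator is ','
assert strip('  廉政  ') == '廉政'
assert div(10, 4) == 2.5
assert div(1, 0) == 0  # division by zero and non-numeric input fall back to 0
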
except (ValueError, TypeError): + pass + + # 分页处理 + paginator = Paginator(all_contents, page_size) + page_obj = paginator.get_page(page_number) + + # 按网站分组内容 + contents_by_website = {} + for content in page_obj: + website_name = content.website.name + if website_name not in contents_by_website: + contents_by_website[website_name] = [] + contents_by_website[website_name].append(content) + + # 最近的任务 + recent_tasks = CrawlTask.objects.order_by('-created_at')[:5] + + # 媒体文件统计 + total_media_files = CrawledContent.objects.aggregate( + total_media=Count('media_files') + )['total_media'] or 0 + + stats = { + 'total_websites': total_websites, + 'total_tasks': total_tasks, + 'total_contents': total_contents, + 'active_tasks': active_tasks, + 'websites': websites, + 'selected_website_id': selected_website_id, + 'page_obj': page_obj, + 'contents_by_website': contents_by_website, + 'page_size': page_size, + 'recent_tasks': recent_tasks, + 'total_media_files': total_media_files, + } + + return render(request, 'crawler/dashboard.html', {'stats': stats}) + + +def search_page(request): + """搜索页面视图""" + keyword = request.GET.get('q', '').strip() + contents = [] + + if keyword: + # 记录搜索关键字 + SearchKeyword.objects.get_or_create( + keyword=keyword, + defaults={'last_used': timezone.now()} + ) + + # 搜索内容 + contents = CrawledContent.objects.filter( + Q(title__icontains=keyword) | + Q(content__icontains=keyword) | + Q(keywords_matched__icontains=keyword) + ).order_by('-created_at')[:50] + + return render(request, 'crawler/search.html', { + 'keyword': keyword, + 'contents': contents + }) + + +def preview_crawled_content(request, content_id): + """预览爬取的内容""" + content = get_object_or_404(CrawledContent, id=content_id) + + # 获取媒体文件 + media_files = content.media_files.all() + + # 生成媒体文件HTML + media_section = "" + if media_files: + media_section = """ +
+

媒体文件

+""" + for media_file in media_files: + if media_file.media_type == 'image': + media_section += f""" +
+

图片: {media_file.alt_text or '无标题'}

+ {media_file.alt_text} +

原始URL: {media_file.original_url}

+

文件大小: {media_file.file_size_display}

+
+""" + elif media_file.media_type == 'video': + media_section += f""" +
+

视频

+ +

原始URL: {media_file.original_url}

+

文件大小: {media_file.file_size_display}

+
+""" + elif media_file.media_type == 'audio': + media_section += f""" +
+

音频

+ +

原始URL: {media_file.original_url}

+

文件大小: {media_file.file_size_display}

+
+""" + else: + media_section += f""" +
+

文件: {media_file.get_media_type_display()}

+

下载文件

+

原始URL: {media_file.original_url}

+

文件大小: {media_file.file_size_display}

+
+""" + media_section += "
" + + # 处理内容格式,将换行符转换为段落和
标签 + formatted_content = content.content.replace('\n\n', '

').replace('\n', '
') + + # 动态生成预览页面 + html_content = f""" + + + + + {content.title} + + + +

+ +

{content.title}

+ +
+

来源网站: {content.website.name} ({content.website.region})

+

原始链接: {content.url}

+

发布时间: {content.publish_date or '未知'}

+

作者: {content.author or '未知'}

+

匹配关键字: {content.keywords_matched}

+

爬取时间: {content.created_at}

+

媒体文件数量: {len(media_files)}

+
+ +
+

{formatted_content}

+
+ +{media_section} + + + """ + return HttpResponse(html_content, content_type='text/html; charset=utf-8') \ No newline at end of file diff --git a/crawler_project/__init__.py b/crawler_project/__init__.py new file mode 100644 index 0000000..39fa216 --- /dev/null +++ b/crawler_project/__init__.py @@ -0,0 +1,4 @@ +# 这将确保Celery应用在Django启动时被加载 +from .celery import app as celery_app + +__all__ = ('celery_app',) diff --git a/crawler_project/asgi.py b/crawler_project/asgi.py new file mode 100644 index 0000000..0d2a373 --- /dev/null +++ b/crawler_project/asgi.py @@ -0,0 +1,16 @@ +""" +ASGI config for crawler_project project. + +It exposes the ASGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/5.2/howto/deployment/asgi/ +""" + +import os + +from django.core.asgi import get_asgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings') + +application = get_asgi_application() diff --git a/crawler_project/celery.py b/crawler_project/celery.py new file mode 100644 index 0000000..a275916 --- /dev/null +++ b/crawler_project/celery.py @@ -0,0 +1,17 @@ +import os +from celery import Celery + +# 设置Django设置模块 +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings') + +app = Celery('crawler_project') + +# 使用Django设置文件配置Celery +app.config_from_object('django.conf:settings', namespace='CELERY') + +# 自动发现任务 +app.autodiscover_tasks() + +@app.task(bind=True) +def debug_task(self): + print(f'Request: {self.request!r}') diff --git a/crawler_project/settings.py b/crawler_project/settings.py new file mode 100644 index 0000000..b22e738 --- /dev/null +++ b/crawler_project/settings.py @@ -0,0 +1,181 @@ +""" +Django settings for crawler_project project. + +Generated by 'django-admin startproject' using Django 5.2.6. + +For more information on this file, see +https://docs.djangoproject.com/en/5.2/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/5.2/ref/settings/ +""" + +from pathlib import Path + +# Build paths inside the project like this: BASE_DIR / 'subdir'. +BASE_DIR = Path(__file__).resolve().parent.parent + + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/5.2/howto/deployment/checklist/ + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = 'django-insecure-w5lm159dl-)=z!dysfxf8!n^o26^6)4^!@5(yp*5-_c=!_tcq!' + +# SECURITY WARNING: don't run with debug turned on in production! 
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = []
+
+
+# Application definition
+
+INSTALLED_APPS = [
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+    'crawler',
+    'rest_framework',
+]
+
+MIDDLEWARE = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'crawler_project.urls'
+
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = 'crawler_project.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/5.2/ref/settings/#databases
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': BASE_DIR / 'db.sqlite3',
+    }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/5.2/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/5.2/topics/i18n/
+
+LANGUAGE_CODE = 'zh-hans'
+
+TIME_ZONE = 'Asia/Shanghai'
+
+USE_I18N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/5.2/howto/static-files/
+
+STATIC_URL = 'static/'
+# urls.py references STATIC_ROOT in DEBUG mode, so define it explicitly
+STATIC_ROOT = BASE_DIR / 'staticfiles'
+
+# Media files (user-uploaded and crawled files)
+MEDIA_URL = '/media/'
+MEDIA_ROOT = BASE_DIR / 'media'
+
+# Default primary key field type
+# https://docs.djangoproject.com/en/5.2/ref/settings/#default-auto-field
+
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
+
+# Celery configuration
+CELERY_BROKER_URL = 'redis://localhost:6379/0'
+CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
+CELERY_ACCEPT_CONTENT = ['json']
+CELERY_TASK_SERIALIZER = 'json'
+CELERY_RESULT_SERIALIZER = 'json'
+CELERY_TIMEZONE = TIME_ZONE
+
+# Crawler configuration
+CRAWLER_SETTINGS = {
+    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+    'REQUEST_DELAY': 1,  # Delay between requests (seconds)
+    'TIMEOUT': 30,  # Request timeout (seconds)
+    'MAX_RETRIES': 3,  # Maximum retry attempts
+}
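+# For reference, a minimal sketch of how a fetch helper might consume
+# CRAWLER_SETTINGS. The name fetch_with_retries is illustrative only and is
+# not defined anywhere in this patch:
+#
+#     import time
+#     import requests
+#     from django.conf import settings
+#
+#     def fetch_with_retries(url):
+#         cfg = settings.CRAWLER_SETTINGS
+#         headers = {'User-Agent': cfg['USER_AGENT']}
+#         for _ in range(cfg['MAX_RETRIES']):
+#             try:
+#                 resp = requests.get(url, headers=headers, timeout=cfg['TIMEOUT'])
+#                 resp.raise_for_status()
+#                 return resp
+#             except requests.RequestException:
+#                 time.sleep(cfg['REQUEST_DELAY'])
+#         return None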
+
+# Target website list
+TARGET_WEBSITES = [
+    {'name': '中共中央纪委', 'url': 'https://www.ccdi.gov.cn/', 'region': '中央'},
+    {'name': '北京纪检监察', 'url': 'https://www.bjsupervision.gov.cn/', 'region': '北京'},
+    {'name': '天津纪检监察', 'url': 'https://www.tjjw.gov.cn/', 'region': '天津'},
+    {'name': '河北纪检监察', 'url': 'http://www.hebcdi.gov.cn/', 'region': '河北'},
+    {'name': '山西纪检监察', 'url': 'http://www.sxdi.gov.cn/', 'region': '山西'},
+    {'name': '内蒙古纪检监察', 'url': 'https://www.nmgjjjc.gov.cn/', 'region': '内蒙古'},
+    {'name': '辽宁纪检监察', 'url': 'https://www.lnsjjjc.gov.cn/', 'region': '辽宁'},
+    {'name': '吉林纪检监察', 'url': 'http://ccdijl.gov.cn/', 'region': '吉林'},
+    {'name': '黑龙江纪检监察', 'url': 'https://www.hljjjjc.gov.cn/Hljjjjc/', 'region': '黑龙江'},
+    {'name': '上海纪检监察', 'url': 'https://www.shjjjc.gov.cn/', 'region': '上海'},
+    {'name': '江苏纪检监察', 'url': 'https://www.jssjw.gov.cn/', 'region': '江苏'},
+    {'name': '浙江纪检监察', 'url': 'https://www.zjsjw.gov.cn/shouye/', 'region': '浙江'},
+    {'name': '安徽纪检监察', 'url': 'http://www.ahjjjc.gov.cn/', 'region': '安徽'},
+    {'name': '福建纪检监察', 'url': 'https://www.fjcdi.gov.cn/cms/html/fjsjwjw/index.html', 'region': '福建'},
+    {'name': '江西纪检监察', 'url': 'http://www.jxdi.gov.cn/', 'region': '江西'},
+    {'name': '山东纪检监察', 'url': 'https://www.sdjj.gov.cn/', 'region': '山东'},
+    {'name': '河南纪检监察', 'url': 'https://www.hnsjw.gov.cn/sitesources/hnsjct/page_pc/index.html', 'region': '河南'},
+    {'name': '湖北纪检监察', 'url': 'https://www.hbjwjc.gov.cn/', 'region': '湖北'},
+    {'name': '湖南纪检监察', 'url': 'https://www.sxfj.gov.cn/', 'region': '湖南'},
+    {'name': '广东纪检监察', 'url': 'https://www.gdjct.gd.gov.cn/', 'region': '广东'},
+    {'name': '广西纪检监察', 'url': 'https://www.gxjjw.gov.cn/index.shtml', 'region': '广西'},
+    {'name': '海南纪检监察', 'url': 'https://www.hncdi.gov.cn/web/hnlzw/v2/html/index.jsp', 'region': '海南'},
+    {'name': '重庆纪检监察', 'url': 'https://jjc.cq.gov.cn/', 'region': '重庆'},
+    {'name': '四川纪检监察', 'url': 'https://www.scjc.gov.cn/', 'region': '四川'},
+    {'name': '贵州纪检监察', 'url': 'http://www.gzdis.gov.cn/', 'region': '贵州'},
+    {'name': '云南纪检监察', 'url': 'http://www.ynjjjc.gov.cn/', 'region': '云南'},
+    {'name': '西藏纪检监察', 'url': 'http://www.xzjjw.gov.cn/', 'region': '西藏'},
+    {'name': '陕西纪检监察', 'url': 'https://www.qinfeng.gov.cn/', 'region': '陕西'},
+    {'name': '甘肃纪检监察', 'url': 'http://www.gsjw.gov.cn/', 'region': '甘肃'},
+    {'name': '青海纪检监察', 'url': 'http://www.qhjc.gov.cn/', 'region': '青海'},
+    {'name': '宁夏纪检监察', 'url': 'http://www.nxjjjc.gov.cn/', 'region': '宁夏'},
+    {'name': '新疆纪检监察', 'url': 'https://www.xjjw.gov.cn/', 'region': '新疆'},
+    {'name': '新疆兵团纪检监察', 'url': 'http://btjw.xjbt.gov.cn/', 'region': '新疆兵团'},
+]
diff --git a/crawler_project/urls.py b/crawler_project/urls.py
new file mode 100644
index 0000000..2270154
--- /dev/null
+++ b/crawler_project/urls.py
@@ -0,0 +1,29 @@
+"""
+URL configuration for crawler_project project.
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/5.2/topics/http/urls/
+Examples:
+Function views
+    1. Add an import:  from my_app import views
+    2. Add a URL to urlpatterns:  path('', views.home, name='home')
+Class-based views
+    1. Add an import:  from other_app.views import Home
+    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path, include
+from django.conf import settings
+from django.conf.urls.static import static
+
+urlpatterns = [
+    path('admin/', admin.site.urls),
+    path('', include('crawler.urls')),
+]
+
+if settings.DEBUG:
+    urlpatterns += static(settings.STATIC_URL, document_root=settings.STATIC_ROOT)
+    urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
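+# Note: static() returns an empty pattern list when DEBUG is False, so in
+# production MEDIA_ROOT and STATIC_ROOT must be served by the front-end web
+# server (for example, an nginx location block) instead of Django.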
diff --git a/crawler_project/wsgi.py b/crawler_project/wsgi.py
new file mode 100644
index 0000000..629739b
--- /dev/null
+++ b/crawler_project/wsgi.py
@@ -0,0 +1,16 @@
+"""
+WSGI config for crawler_project project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/5.2/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')
+
+application = get_wsgi_application()
diff --git a/manage.py b/manage.py
new file mode 100755
index 0000000..c045e87
--- /dev/null
+++ b/manage.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+"""Django's command-line utility for administrative tasks."""
+import os
+import sys
+
+
+def main():
+    """Run administrative tasks."""
+    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')
+    try:
+        from django.core.management import execute_from_command_line
+    except ImportError as exc:
+        raise ImportError(
+            "Couldn't import Django. Are you sure it's installed and "
+            "available on your PYTHONPATH environment variable? Did you "
+            "forget to activate a virtual environment?"
+        ) from exc
+    execute_from_command_line(sys.argv)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..0a88a0d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,29 @@
+amqp==5.3.1
+asgiref==3.9.1
+beautifulsoup4==4.13.5
+billiard==4.2.2
+celery==5.5.3
+certifi==2025.8.3
+charset-normalizer==3.4.3
+click==8.3.0
+click-didyoumean==0.3.1
+click-plugins==1.1.1.2
+click-repl==0.3.0
+Django==5.2.6
+djangorestframework==3.15.2
+idna==3.10
+kombu==5.5.4
+lxml==6.0.2
+packaging==25.0
+prompt_toolkit==3.0.52
+python-dateutil==2.9.0.post0
+redis==6.4.0
+requests==2.32.5
+six==1.17.0
+soupsieve==2.8
+sqlparse==0.5.3
+typing_extensions==4.15.0
+tzdata==2025.2
+urllib3==2.5.0
+vine==5.1.0
+wcwidth==0.2.14
diff --git a/start.sh b/start.sh
new file mode 100755
index 0000000..30e5b9d
--- /dev/null
+++ b/start.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+echo "Starting the website crawler system..."
+
+# Make sure we are running from the project root
+if [ ! -f "manage.py" ]; then
+    echo "Error: run this script from the project root directory"
+    exit 1
+fi
+
+# Check the Python environment
+if ! command -v python3 &> /dev/null; then
+    echo "Error: python3 not found"
+    exit 1
+fi
+
+# Install dependencies
+echo "Installing dependencies..."
+python3 -m pip install -r requirements.txt
+
+# Run database migrations
+echo "Running database migrations..."
+python3 manage.py makemigrations
+python3 manage.py migrate
+
+# Initialize website data
+echo "Initializing website data..."
+python3 manage.py init_websites
+
+# Create the superuser if it does not exist
+echo "Checking for superuser..."
+python3 manage.py shell -c "
+from django.contrib.auth import get_user_model
+User = get_user_model()
+if not User.objects.filter(username='admin').exists():
+    User.objects.create_superuser('admin', 'admin@example.com', 'admin123')
+    print('Created superuser: admin/admin123')
+else:
+    print('Superuser already exists')
+"
+
+echo "Starting the Django server..."
+echo "URL: http://localhost:8000"
+echo "Admin: http://localhost:8000/admin"
+echo "Username: admin, password: admin123"
+echo ""
+echo "Press Ctrl+C to stop the server"
+
+python3 manage.py runserver 0.0.0.0:8000
diff --git a/start_celery.sh b/start_celery.sh
new file mode 100755
index 0000000..d860906
--- /dev/null
+++ b/start_celery.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Start the Celery worker
+echo "Starting Celery worker..."
+celery -A crawler_project worker --loglevel=info --concurrency=4 &
+WORKER_PID=$!
+
+# Start Celery beat (scheduled tasks); capture each PID right after launch,
+# since $! only ever refers to the most recently backgrounded process
+echo "Starting Celery beat..."
+celery -A crawler_project beat --loglevel=info &
+BEAT_PID=$!
+
+echo "Celery services started"
+echo "Worker PID: $WORKER_PID, Beat PID: $BEAT_PID"
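+# To stop both services later, one option (assuming no other Celery
+# processes for this project are running on the machine) is:
+#     pkill -f 'celery -A crawler_project'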