Base setup

2025-09-23 13:30:03 +08:00
parent 1057ed8690
commit e51154bb29
34 changed files with 2574 additions and 1 deletion

.gitignore (vendored, 2 lines changed)

@@ -166,7 +166,7 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
 # Ruff stuff:
 .ruff_cache/

crawler/__init__.py (new file, empty)

crawler/admin.py (new file, 199 lines)

@@ -0,0 +1,199 @@
from django.contrib import admin
from django.utils.html import format_html
from django.urls import reverse
from django.utils.safestring import mark_safe
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
from .tasks import crawl_websites_task


@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
    list_display = ['name', 'region', 'url', 'is_active', 'created_at']
    list_filter = ['region', 'is_active', 'created_at']
    search_fields = ['name', 'url', 'region']
    list_editable = ['is_active']
    ordering = ['region', 'name']


@admin.register(CrawlTask)
class CrawlTaskAdmin(admin.ModelAdmin):
    list_display = ['name', 'status', 'created_by', 'progress_display', 'created_at', 'completed_at']
    list_filter = ['status', 'created_by', 'created_at']
    search_fields = ['name', 'keywords']
    readonly_fields = ['created_at', 'started_at', 'completed_at', 'progress_display']
    filter_horizontal = ['websites']
    actions = ['execute_crawl_task']

    def progress_display(self, obj):
        """Render the task progress, color-coded by status."""
        if obj.status == 'completed':
            color = 'green'
        elif obj.status == 'failed':
            color = 'red'
        elif obj.status == 'running':
            color = 'orange'
        else:
            color = 'gray'
        return format_html(
            '<span style="color: {};">{}% ({}/{})</span>',
            color,
            f'{obj.progress_percentage:.1f}',
            obj.crawled_pages,
            obj.total_pages,
        )
    progress_display.short_description = '进度'

    def execute_crawl_task(self, request, queryset):
        """Run the selected crawl tasks."""
        for task in queryset:
            # Reset the task status to pending
            task.status = 'pending'
            task.save()
            # Execute the crawl asynchronously
            crawl_websites_task.delay(task.id)
        self.message_user(request, f"已启动 {queryset.count()} 个爬取任务。")
    execute_crawl_task.short_description = "执行选中的爬取任务"


@admin.register(CrawledContent)
class CrawledContentAdmin(admin.ModelAdmin):
    list_display = ['title_short', 'website', 'task', 'keywords_matched', 'media_count', 'publish_date', 'is_local_saved', 'created_at']
    list_filter = ['website', 'task', 'created_at', 'publish_date', 'is_local_saved']
    search_fields = ['title', 'content', 'keywords_matched']
    readonly_fields = ['created_at', 'preview_content', 'media_files_display']
    ordering = ['-created_at']

    def title_short(self, obj):
        """Display a truncated title."""
        return obj.title[:50] + '...' if len(obj.title) > 50 else obj.title
    title_short.short_description = '标题'

    def media_count(self, obj):
        """Display the number of attached media files."""
        count = obj.media_files.count()
        if count > 0:
            return format_html(
                '<span style="color: green; font-weight: bold;">{}</span>',
                count
            )
        return "0"
    media_count.short_description = '媒体文件'

    def preview_content(self, obj):
        """Preview the content."""
        if obj.is_local_saved:
            # Requires a custom admin URL named 'crawled_content_preview'
            url = reverse('admin:crawled_content_preview', args=[obj.id])
            return format_html(
                '<a href="{}" target="_blank" class="button">预览文章</a>',
                url
            )
        elif obj.content:
            return format_html(
                '<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px;">{}</div>',
                obj.get_preview_content(500)
            )
        else:
            return "无内容"
    preview_content.short_description = '内容预览'

    def media_files_display(self, obj):
        """Render the list of media files."""
        media_files = obj.media_files.all()
        if not media_files:
            return "无媒体文件"
        html = "<div style='max-height: 300px; overflow-y: auto;'>"
        for media_file in media_files:
            if media_file.media_type == 'image':
                html += format_html(
                    '<div style="margin-bottom: 10px; border: 1px solid #ddd; padding: 5px;">'
                    '<strong>图片:</strong> {}<br>'
                    '<img src="/media/{}" style="max-width: 150px; max-height: 150px;" /><br>'
                    '<small>大小: {}</small>'
                    '</div>',
                    media_file.alt_text or '无标题',
                    media_file.local_file.name,
                    media_file.file_size_display
                )
            elif media_file.media_type == 'video':
                html += format_html(
                    '<div style="margin-bottom: 10px; border: 1px solid #ddd; padding: 5px;">'
                    '<strong>视频:</strong><br>'
                    '<video controls style="max-width: 200px; max-height: 150px;">'
                    '<source src="/media/{}" type="{}">'
                    '</video><br>'
                    '<small>大小: {}</small>'
                    '</div>',
                    media_file.local_file.name,
                    media_file.mime_type,
                    media_file.file_size_display
                )
            else:
                html += format_html(
                    '<div style="margin-bottom: 10px; border: 1px solid #ddd; padding: 5px;">'
                    '<strong>{}:</strong> <a href="/media/{}" download>下载</a><br>'
                    '<small>大小: {}</small>'
                    '</div>',
                    media_file.get_media_type_display(),
                    media_file.local_file.name,
                    media_file.file_size_display
                )
        html += "</div>"
        # The fragments above are already escaped by format_html(); wrapping
        # the concatenated string in format_html() again would treat literal
        # braces as placeholders, so mark it safe instead.
        return mark_safe(html)
    media_files_display.short_description = '媒体文件'


@admin.register(CrawlLog)
class CrawlLogAdmin(admin.ModelAdmin):
    list_display = ['level', 'message_short', 'website', 'task', 'created_at']
    list_filter = ['level', 'website', 'task', 'created_at']
    search_fields = ['message']
    readonly_fields = ['created_at']
    ordering = ['-created_at']

    def message_short(self, obj):
        """Display a truncated message."""
        return obj.message[:100] + '...' if len(obj.message) > 100 else obj.message
    message_short.short_description = '消息'


@admin.register(MediaFile)
class MediaFileAdmin(admin.ModelAdmin):
    list_display = ['content', 'media_type', 'file_size_display', 'mime_type', 'created_at']
    list_filter = ['media_type', 'created_at']
    search_fields = ['content__title', 'original_url', 'alt_text']
    readonly_fields = ['created_at', 'file_size_display', 'media_preview']
    ordering = ['-created_at']

    def media_preview(self, obj):
        """Inline preview of the media file."""
        if obj.media_type == 'image' and obj.local_file:
            return format_html(
                '<img src="/media/{}" style="max-width: 200px; max-height: 200px;" />',
                obj.local_file.name
            )
        elif obj.media_type == 'video' and obj.local_file:
            return format_html(
                '<video controls style="max-width: 200px; max-height: 200px;"><source src="/media/{}" type="{}"></video>',
                obj.local_file.name,
                obj.mime_type
            )
        elif obj.media_type == 'audio' and obj.local_file:
            return format_html(
                '<audio controls><source src="/media/{}" type="{}"></audio>',
                obj.local_file.name,
                obj.mime_type
            )
        else:
            return "无预览"
    media_preview.short_description = '预览'


@admin.register(SearchKeyword)
class SearchKeywordAdmin(admin.ModelAdmin):
    list_display = ['keyword', 'is_active', 'created_at', 'last_used']
    list_filter = ['is_active', 'created_at', 'last_used']
    search_fields = ['keyword']
    list_editable = ['is_active']
    ordering = ['-last_used', '-created_at']
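
Note that preview_content() above reverses admin:crawled_content_preview, but no route with that name is registered anywhere in this commit. A minimal sketch of how CrawledContentAdmin could register it via get_urls(); the delegation to the public preview view is a hypothetical illustration, not the project's confirmed wiring:

# Hypothetical sketch (not in this commit): register the URL that
# preview_content() reverses by overriding get_urls() on CrawledContentAdmin.
from django.urls import path

class CrawledContentAdmin(admin.ModelAdmin):
    # ... configuration as above ...

    def get_urls(self):
        urls = super().get_urls()
        custom = [
            path(
                '<int:content_id>/preview/',
                self.admin_site.admin_view(self.preview_view),
                # admin URLs are namespaced, so this reverses as
                # 'admin:crawled_content_preview'
                name='crawled_content_preview',
            ),
        ]
        return custom + urls

    def preview_view(self, request, content_id):
        # Delegate to the public preview view in crawler/views.py (assumed)
        from .views import preview_crawled_content
        return preview_crawled_content(request, content_id)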

crawler/apps.py (new file, 6 lines)

@@ -0,0 +1,6 @@
from django.apps import AppConfig


class CrawlerConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'crawler'

crawler/crawler_engine.py (new file, 578 lines)

@@ -0,0 +1,578 @@
import requests
import time
import re
import logging
import os
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from django.conf import settings
from django.utils import timezone
from django.core.files.base import ContentFile
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile

# Suppress SSL warnings (requests below are made with verify=False)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Module-level logger
logger = logging.getLogger(__name__)


class WebsiteCrawler:
    """Website crawling engine."""

    def __init__(self, task_id):
        self.task = CrawlTask.objects.get(id=task_id)
        self.keywords = [kw.strip() for kw in self.task.keywords.split(',') if kw.strip()]
        # Session with a retry strategy
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': settings.CRAWLER_SETTINGS['USER_AGENT']
        })
        retry_strategy = Retry(
            total=settings.CRAWLER_SETTINGS.get('MAX_RETRIES', 3),
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        # Per-request timeout
        self.timeout = settings.CRAWLER_SETTINGS['TIMEOUT']

    def log(self, level, message, website=None):
        """Write a log entry to the database and to Python's logging system."""
        CrawlLog.objects.create(
            task=self.task,
            website=website,
            level=level,
            message=message
        )
        logger.log(getattr(logging, level.upper()), f"Task {self.task.id}: {message}")

    def update_task_status(self, status, **kwargs):
        """Update the task status and the matching timestamps."""
        self.task.status = status
        if status == 'running' and not self.task.started_at:
            self.task.started_at = timezone.now()
        elif status in ['completed', 'failed', 'cancelled']:
            self.task.completed_at = timezone.now()
        for key, value in kwargs.items():
            setattr(self.task, key, value)
        self.task.save()

    def extract_text_content(self, soup):
        """Extract text content while preserving paragraph structure."""
        # Drop script and style tags
        for script in soup(["script", "style"]):
            script.decompose()
        # Collect paragraph-like elements so the structure survives
        paragraphs = []
        for element in soup.find_all(['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br']):
            if element.name in ['p', 'div']:
                text = element.get_text().strip()
                if text:
                    paragraphs.append(text)
            elif element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                text = element.get_text().strip()
                if text:
                    paragraphs.append(f"\n{text}\n")  # surround headings with newlines
            elif element.name == 'br':
                paragraphs.append('\n')
        # Fall back to raw text if no paragraph tags were found
        if not paragraphs:
            text = soup.get_text()
            # Clean the text but keep line breaks
            lines = []
            for line in text.splitlines():
                line = line.strip()
                if line:
                    lines.append(line)
            return '\n\n'.join(lines)
        # Join paragraphs with blank lines and collapse extra empty lines
        content = '\n\n'.join(paragraphs)
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
        return content.strip()

    def find_article_links(self, soup, base_url):
        """Find article links on a page."""
        links = []
        # Common selectors for article links
        selectors = [
            'a[href*="article"]',
            'a[href*="news"]',
            'a[href*="content"]',
            'a[href*="detail"]',
            'a[href*="view"]',
            'a[href*="show"]',
            '.news-list a',
            '.article-list a',
            '.content-list a',
            'h3 a',
            'h4 a',
            '.title a',
            '.list-item a'
        ]
        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                href = element.get('href')
                if href:
                    full_url = urljoin(base_url, href)
                    title = element.get_text().strip()
                    if title and len(title) > 5:  # skip overly short titles
                        links.append({
                            'url': full_url,
                            'title': title
                        })
        return links

    def check_keyword_match(self, text, title):
        """Return the keywords that occur in the text or the title."""
        matched_keywords = []
        text_lower = text.lower()
        title_lower = title.lower()
        for keyword in self.keywords:
            keyword_lower = keyword.lower()
            if keyword_lower in text_lower or keyword_lower in title_lower:
                matched_keywords.append(keyword)
        return matched_keywords

    def extract_article_content(self, url, soup):
        """Extract the article body."""
        # Try a series of common content selectors
        content_selectors = [
            '.article-content',
            '.content',
            '.article-body',
            '.news-content',
            '.main-content',
            '.post-content',
            'article',
            '.detail-content',
            '#content',
            '.text'
        ]
        content = ""
        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                content = self.extract_text_content(element)
                if len(content) > 100:  # make sure the content is long enough
                    break
        # Fall back to the whole page if no content area was found
        if not content or len(content) < 100:
            content = self.extract_text_content(soup)
        return content

    def extract_publish_date(self, soup):
        """Extract the publication date."""
        date_selectors = [
            '.publish-time',
            '.pub-time',
            '.date',
            '.time',
            '.publish-date',
            'time[datetime]',
            '.article-time',
            '.news-time'
        ]
        for selector in date_selectors:
            element = soup.select_one(selector)
            if element:
                # date_text is collected for future parsing
                date_text = element.get_text().strip()
                if element.get('datetime'):
                    date_text = element.get('datetime')
                try:
                    # More sophisticated date parsing could go here;
                    # for now, fall back to the current time.
                    return timezone.now()
                except Exception:
                    continue
        return None

    def extract_author(self, soup):
        """Extract the author."""
        author_selectors = [
            '.author',
            '.writer',
            '.publisher',
            '.byline',
            '.article-author',
            '.news-author'
        ]
        for selector in author_selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text().strip()
        return ""

    def download_media_file(self, media_url, crawled_content, media_type='image', alt_text=''):
        """Download a media file."""
        try:
            # Validate the URL
            if not media_url or not media_url.startswith(('http://', 'https://')):
                return None
            response = self.session.get(
                media_url,
                timeout=self.timeout,
                verify=False,
                stream=False  # fetch the full body up front
            )
            response.raise_for_status()
            # File metadata
            content_type = response.headers.get('content-type', '')
            content_length = response.headers.get('content-length')
            file_size = int(content_length) if content_length else len(response.content)
            # Pick a file extension and build a file name
            file_extension = self.get_file_extension_from_url(media_url, content_type)
            filename = f"media_{crawled_content.id}_{crawled_content.media_files.count()}{file_extension}"
            # Create the media file record
            media_file = MediaFile.objects.create(
                content=crawled_content,
                media_type=media_type,
                original_url=media_url,
                file_size=file_size,
                mime_type=content_type,
                alt_text=alt_text
            )
            # Save the payload
            media_file.local_file.save(
                filename,
                ContentFile(response.content),
                save=True
            )
            self.log('info', f'媒体文件已下载: {filename} ({media_type})', crawled_content.website)
            return media_file
        except Exception as e:
            self.log('error', f'下载媒体文件失败 {media_url}: {str(e)}', crawled_content.website)
            return None

    def get_file_extension_from_url(self, url, content_type):
        """Derive a file extension from the URL or the content type."""
        # From the URL path
        parsed_url = urlparse(url)
        path = parsed_url.path
        if '.' in path:
            return os.path.splitext(path)[1]
        # From the content type
        content_type_map = {
            'image/jpeg': '.jpg',
            'image/jpg': '.jpg',
            'image/png': '.png',
            'image/gif': '.gif',
            'image/webp': '.webp',
            'image/svg+xml': '.svg',
            'video/mp4': '.mp4',
            'video/avi': '.avi',
            'video/mov': '.mov',
            'video/wmv': '.wmv',
            'video/flv': '.flv',
            'video/webm': '.webm',
            'audio/mp3': '.mp3',
            'audio/wav': '.wav',
            'audio/ogg': '.ogg',
            'application/pdf': '.pdf',
            'application/msword': '.doc',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
        }
        return content_type_map.get(content_type.lower(), '.bin')

    def resolve_media_url(self, src, base_url):
        """Resolve a possibly relative media URL against the page URL."""
        if src.startswith('//'):
            return 'https:' + src
        if not src.startswith(('http://', 'https://')):
            return urljoin(base_url, src)
        return src

    def extract_and_download_media(self, soup, crawled_content, base_url):
        """Extract and download the media files found on a page."""
        media_files = []
        # Images
        images = soup.find_all('img')
        self.log('info', f'找到 {len(images)} 个图片标签', crawled_content.website)
        for img in images:
            src = img.get('src')
            if src:
                src = self.resolve_media_url(src, base_url)
                alt_text = img.get('alt', '')
                self.log('info', f'尝试下载图片: {src}', crawled_content.website)
                media_file = self.download_media_file(src, crawled_content, 'image', alt_text)
                if media_file:
                    media_files.append(media_file)
                    self.log('info', f'成功下载图片: {media_file.local_file.name}', crawled_content.website)
        # Videos
        videos = soup.find_all(['video', 'source'])
        for video in videos:
            src = video.get('src')
            if src:
                src = self.resolve_media_url(src, base_url)
                media_file = self.download_media_file(src, crawled_content, 'video')
                if media_file:
                    media_files.append(media_file)
        # Audio
        audios = soup.find_all('audio')
        for audio in audios:
            src = audio.get('src')
            if src:
                src = self.resolve_media_url(src, base_url)
                media_file = self.download_media_file(src, crawled_content, 'audio')
                if media_file:
                    media_files.append(media_file)
        return media_files

    def mark_content_saved(self, crawled_content):
        """Mark the content as saved (the content itself lives in the database)."""
        try:
            crawled_content.is_local_saved = True
            crawled_content.save()
            media_count = crawled_content.media_files.count()
            self.log('info', f'文章内容已保存到数据库 (包含 {media_count} 个媒体文件)', crawled_content.website)
            return True
        except Exception as e:
            self.log('error', f'标记内容保存状态失败: {str(e)}', crawled_content.website)
            return False

    def fix_encoding(self, response):
        """Fall back to the charset from the Content-Type header, or UTF-8."""
        if response.encoding != 'utf-8':
            content_type = response.headers.get('content-type', '')
            if 'charset=' in content_type:
                response.encoding = content_type.split('charset=')[-1]
            else:
                response.encoding = 'utf-8'

    def crawl_website(self, website):
        """Crawl a single website."""
        self.log('info', f'开始爬取网站: {website.name}', website)
        try:
            # Fetch the home page (SSL verification disabled)
            response = self.session.get(
                website.url,
                timeout=self.timeout,
                verify=False
            )
            response.raise_for_status()
            self.fix_encoding(response)
            soup = BeautifulSoup(response.content, 'html.parser')
            # Find article links
            article_links = self.find_article_links(soup, website.url)
            self.log('info', f'找到 {len(article_links)} 个文章链接', website)
            crawled_count = 0
            for link_info in article_links:
                try:
                    # Fetch the article page (SSL verification disabled)
                    article_response = self.session.get(
                        link_info['url'],
                        timeout=self.timeout,
                        verify=False
                    )
                    article_response.raise_for_status()
                    self.fix_encoding(article_response)
                    article_soup = BeautifulSoup(article_response.content, 'html.parser')
                    # Extract the content
                    content = self.extract_article_content(link_info['url'], article_soup)
                    title = link_info['title']
                    # Check for keyword matches
                    matched_keywords = self.check_keyword_match(content, title)
                    if matched_keywords:
                        # Extract additional metadata
                        publish_date = self.extract_publish_date(article_soup)
                        author = self.extract_author(article_soup)
                        # Save the content
                        crawled_content = CrawledContent.objects.create(
                            task=self.task,
                            website=website,
                            title=title,
                            content=content,
                            url=link_info['url'],
                            publish_date=publish_date,
                            author=author,
                            keywords_matched=','.join(matched_keywords),
                            is_local_saved=False  # set to True once saved locally
                        )
                        # Extract and download media files
                        media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
                        # Mark the content as saved
                        self.mark_content_saved(crawled_content)
                        crawled_count += 1
                        self.log('info', f'保存文章: {title[:50]}...', website)
                    # Delay between requests
                    time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])
                except requests.exceptions.SSLError as e:
                    self.log('error', f'SSL错误,跳过文章 {link_info["url"]}: {str(e)}', website)
                    continue
                except requests.exceptions.ConnectionError as e:
                    self.log('error', f'连接错误,跳过文章 {link_info["url"]}: {str(e)}', website)
                    continue
                except requests.exceptions.Timeout as e:
                    self.log('error', f'请求超时,跳过文章 {link_info["url"]}: {str(e)}', website)
                    continue
                except requests.exceptions.RequestException as e:
                    self.log('error', f'网络请求错误,跳过文章 {link_info["url"]}: {str(e)}', website)
                    continue
                except UnicodeDecodeError as e:
                    self.log('error', f'字符编码错误,跳过文章 {link_info["url"]}: {str(e)}', website)
                    continue
                except Exception as e:
                    self.log('error', f'处理文章失败 {link_info["url"]}: {str(e)}', website)
                    continue
            self.log('info', f'网站爬取完成,共保存 {crawled_count} 篇文章', website)
            return crawled_count
        except requests.exceptions.SSLError as e:
            self.log('error', f'爬取网站SSL错误: {str(e)}', website)
            return 0
        except requests.exceptions.ConnectionError as e:
            self.log('error', f'爬取网站连接错误: {str(e)}', website)
            return 0
        except requests.exceptions.Timeout as e:
            self.log('error', f'爬取网站超时: {str(e)}', website)
            return 0
        except requests.exceptions.RequestException as e:
            self.log('error', f'爬取网站网络错误: {str(e)}', website)
            return 0
        except Exception as e:
            self.log('error', f'爬取网站失败: {str(e)}', website)
            return 0

    def run(self):
        """Run the crawl task."""
        self.log('info', f'开始执行爬取任务: {self.task.name}')
        self.update_task_status('running')
        total_crawled = 0
        websites = self.task.websites.filter(is_active=True)
        self.task.total_pages = websites.count()
        self.task.save()
        for website in websites:
            try:
                crawled_count = self.crawl_website(website)
                total_crawled += crawled_count
                self.task.crawled_pages += 1
                self.task.save()
            except Exception as e:
                self.log('error', f'爬取网站 {website.name} 时发生错误: {str(e)}', website)
                continue
        # Update the final task status
        if total_crawled > 0:
            self.update_task_status('completed')
            self.log('info', f'爬取任务完成,共爬取 {total_crawled} 篇文章')
        else:
            self.update_task_status('failed', error_message='没有找到匹配的内容')
            self.log('error', '爬取任务失败,没有找到匹配的内容')


def run_crawl_task(task_id):
    """Run a crawl task (entry point used by the Celery tasks)."""
    try:
        crawler = WebsiteCrawler(task_id)
        crawler.run()
        return f"任务 {task_id} 执行完成"
    except Exception as e:
        # Record the exception
        logger.error(f"执行任务 {task_id} 时发生异常: {str(e)}", exc_info=True)
        task = CrawlTask.objects.get(id=task_id)
        task.status = 'failed'
        task.error_message = str(e)
        task.completed_at = timezone.now()
        task.save()
        CrawlLog.objects.create(
            task=task,
            level='error',
            message=f'任务执行失败: {str(e)}'
        )
        return f"任务 {task_id} 执行失败: {str(e)}"


@@ -0,0 +1,36 @@
from django.core.management.base import BaseCommand
from django.conf import settings
from crawler.models import Website


class Command(BaseCommand):
    help = '初始化目标网站数据'

    def handle(self, *args, **options):
        self.stdout.write('开始初始化目标网站数据...')
        # Clear existing data
        Website.objects.all().delete()
        # Read the website list from settings
        websites_data = settings.TARGET_WEBSITES
        created_count = 0
        for website_data in websites_data:
            # After the delete above, the "already exists" branch only
            # triggers for duplicate URLs within the settings list
            website, created = Website.objects.get_or_create(
                url=website_data['url'],
                defaults={
                    'name': website_data['name'],
                    'region': website_data['region'],
                    'is_active': True
                }
            )
            if created:
                created_count += 1
                self.stdout.write(f'创建网站: {website.name}')
            else:
                self.stdout.write(f'网站已存在: {website.name}')
        self.stdout.write(
            self.style.SUCCESS(f'成功初始化 {created_count} 个网站')
        )


@@ -0,0 +1,69 @@
from django.core.management.base import BaseCommand
from crawler.models import CrawlTask, Website
from crawler.tasks import crawl_websites_task


class Command(BaseCommand):
    help = '运行爬虫任务'

    def add_arguments(self, parser):
        parser.add_argument(
            '--keywords',
            type=str,
            required=True,
            help='搜索关键字,多个关键字用逗号分隔'
        )
        parser.add_argument(
            '--websites',
            type=str,
            help='网站ID列表,用逗号分隔。不指定则爬取所有网站'
        )
        parser.add_argument(
            '--name',
            type=str,
            help='任务名称'
        )

    def handle(self, *args, **options):
        keywords = options['keywords']
        website_ids = options.get('websites')
        # argparse sets omitted optional arguments to None, so use `or`
        # rather than a dict default (which would never kick in)
        task_name = options.get('name') or f'关键字搜索: {keywords}'
        # Resolve the target websites
        if website_ids:
            website_id_list = [int(website_id.strip()) for website_id in website_ids.split(',')]
            websites = Website.objects.filter(id__in=website_id_list, is_active=True)
        else:
            websites = Website.objects.filter(is_active=True)
        if not websites.exists():
            self.stdout.write(
                self.style.ERROR('没有找到可用的网站')
            )
            return
        # Create the task
        task = CrawlTask.objects.create(
            name=task_name,
            keywords=keywords,
            created_by='management_command'
        )
        task.websites.set(websites)
        self.stdout.write(f'创建任务: {task.name}')
        self.stdout.write(f'目标网站: {websites.count()}')
        self.stdout.write(f'搜索关键字: {keywords}')
        # Queue the task; if the queue (e.g. Redis) is unavailable, run it
        # synchronously instead
        try:
            crawl_websites_task.delay(task.id)
            self.stdout.write('任务已提交到队列')
        except Exception as e:
            self.stdout.write(f'队列不可用,直接运行任务: {e}')
            from crawler.crawler_engine import WebsiteCrawler
            crawler = WebsiteCrawler(task.id)
            crawler.run()
        self.stdout.write(
            self.style.SUCCESS(f'任务已启动,任务ID: {task.id}')
        )

crawler/migrations/0001_initial.py (new file, 106 lines)

@@ -0,0 +1,106 @@
# Generated by Django 5.2.6 on 2025-09-22 16:27

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

    initial = True

    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='SearchKeyword',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('keyword', models.CharField(max_length=100, unique=True, verbose_name='关键字')),
                ('is_active', models.BooleanField(default=True, verbose_name='是否启用')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('last_used', models.DateTimeField(blank=True, null=True, verbose_name='最后使用时间')),
            ],
            options={
                'verbose_name': '搜索关键字',
                'verbose_name_plural': '搜索关键字',
                'ordering': ['-last_used', '-created_at'],
            },
        ),
        migrations.CreateModel(
            name='Website',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=100, verbose_name='网站名称')),
                ('url', models.URLField(verbose_name='网站地址')),
                ('region', models.CharField(max_length=50, verbose_name='所属地区')),
                ('is_active', models.BooleanField(default=True, verbose_name='是否启用')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('updated_at', models.DateTimeField(auto_now=True, verbose_name='更新时间')),
            ],
            options={
                'verbose_name': '目标网站',
                'verbose_name_plural': '目标网站',
                'ordering': ['region', 'name'],
            },
        ),
        migrations.CreateModel(
            name='CrawlTask',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=200, verbose_name='任务名称')),
                ('keywords', models.TextField(help_text='多个关键字用逗号分隔', verbose_name='搜索关键字')),
                ('status', models.CharField(choices=[('pending', '待执行'), ('running', '执行中'), ('completed', '已完成'), ('failed', '执行失败'), ('cancelled', '已取消')], default='pending', max_length=20, verbose_name='任务状态')),
                ('created_by', models.CharField(default='system', max_length=100, verbose_name='创建者')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('started_at', models.DateTimeField(blank=True, null=True, verbose_name='开始时间')),
                ('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='完成时间')),
                ('error_message', models.TextField(blank=True, verbose_name='错误信息')),
                ('total_pages', models.IntegerField(default=0, verbose_name='总页数')),
                ('crawled_pages', models.IntegerField(default=0, verbose_name='已爬取页数')),
                ('websites', models.ManyToManyField(to='crawler.website', verbose_name='目标网站')),
            ],
            options={
                'verbose_name': '爬取任务',
                'verbose_name_plural': '爬取任务',
                'ordering': ['-created_at'],
            },
        ),
        migrations.CreateModel(
            name='CrawlLog',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('level', models.CharField(choices=[('info', '信息'), ('warning', '警告'), ('error', '错误'), ('debug', '调试')], max_length=20, verbose_name='日志级别')),
                ('message', models.TextField(verbose_name='日志消息')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='记录时间')),
                ('task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='logs', to='crawler.crawltask', verbose_name='所属任务')),
                ('website', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='crawler.website', verbose_name='相关网站')),
            ],
            options={
                'verbose_name': '爬取日志',
                'verbose_name_plural': '爬取日志',
                'ordering': ['-created_at'],
            },
        ),
        migrations.CreateModel(
            name='CrawledContent',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('title', models.CharField(max_length=500, verbose_name='标题')),
                ('content', models.TextField(verbose_name='内容')),
                ('url', models.URLField(verbose_name='原文链接')),
                ('publish_date', models.DateTimeField(blank=True, null=True, verbose_name='发布时间')),
                ('author', models.CharField(blank=True, max_length=100, verbose_name='作者')),
                ('keywords_matched', models.TextField(help_text='匹配到的关键字,用逗号分隔', verbose_name='匹配的关键字')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='爬取时间')),
                ('task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='contents', to='crawler.crawltask', verbose_name='所属任务')),
                ('website', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawler.website', verbose_name='来源网站')),
            ],
            options={
                'verbose_name': '爬取内容',
                'verbose_name_plural': '爬取内容',
                'ordering': ['-created_at'],
                'indexes': [models.Index(fields=['task', 'website'], name='crawler_cra_task_id_6244e7_idx'), models.Index(fields=['created_at'], name='crawler_cra_created_a116d2_idx'), models.Index(fields=['publish_date'], name='crawler_cra_publish_5b8ccc_idx')],
            },
        ),
    ]

crawler/migrations/0002_crawledcontent_is_local_saved_and_more.py (new file, 24 lines)

@@ -0,0 +1,24 @@
# Generated by Django 5.2.6 on 2025-09-23 00:38

import crawler.models
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('crawler', '0001_initial'),
    ]

    operations = [
        migrations.AddField(
            model_name='crawledcontent',
            name='is_local_saved',
            field=models.BooleanField(default=False, verbose_name='是否已本地保存'),
        ),
        migrations.AddField(
            model_name='crawledcontent',
            name='local_file',
            field=models.FileField(blank=True, null=True, upload_to=crawler.models.crawled_content_file_path, verbose_name='本地文件'),
        ),
    ]


@@ -0,0 +1,35 @@
# Generated by Django 5.2.6 on 2025-09-23 01:05

import crawler.models
import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('crawler', '0002_crawledcontent_is_local_saved_and_more'),
    ]

    operations = [
        migrations.CreateModel(
            name='MediaFile',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('media_type', models.CharField(choices=[('image', '图片'), ('video', '视频'), ('audio', '音频'), ('document', '文档')], max_length=20, verbose_name='媒体类型')),
                ('original_url', models.URLField(verbose_name='原始URL')),
                ('local_file', models.FileField(upload_to=crawler.models.media_file_path, verbose_name='本地文件')),
                ('file_size', models.BigIntegerField(blank=True, null=True, verbose_name='文件大小(字节)')),
                ('mime_type', models.CharField(blank=True, max_length=100, verbose_name='MIME类型')),
                ('alt_text', models.CharField(blank=True, max_length=500, verbose_name='替代文本')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('content', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='media_files', to='crawler.crawledcontent', verbose_name='所属内容')),
            ],
            options={
                'verbose_name': '媒体文件',
                'verbose_name_plural': '媒体文件',
                'ordering': ['-created_at'],
                'indexes': [models.Index(fields=['content', 'media_type'], name='crawler_med_content_3a9468_idx'), models.Index(fields=['created_at'], name='crawler_med_created_13ff00_idx')],
            },
        ),
    ]


crawler/models.py (new file, 195 lines)

@@ -0,0 +1,195 @@
from django.db import models
from django.utils import timezone
import os


def crawled_content_file_path(instance, filename):
    """Build the storage path for a crawled-content file."""
    # Unique name from the task id and a timestamp
    timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    name, ext = os.path.splitext(filename)
    return f'crawled_content/{instance.task.id}/{timestamp}_{instance.id}{ext}'


def media_file_path(instance, filename):
    """Build the storage path for a media file."""
    # Path from the task id and the content id
    timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    name, ext = os.path.splitext(filename)
    return f'{instance.content.task.id}/{instance.content.id}/{timestamp}_{name}{ext}'


class Website(models.Model):
    """A target website."""
    name = models.CharField(max_length=100, verbose_name='网站名称')
    url = models.URLField(verbose_name='网站地址')
    region = models.CharField(max_length=50, verbose_name='所属地区')
    is_active = models.BooleanField(default=True, verbose_name='是否启用')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
    updated_at = models.DateTimeField(auto_now=True, verbose_name='更新时间')

    class Meta:
        verbose_name = '目标网站'
        verbose_name_plural = '目标网站'
        ordering = ['region', 'name']

    def __str__(self):
        return f"{self.region} - {self.name}"


class CrawlTask(models.Model):
    """A crawl task."""
    TASK_STATUS_CHOICES = [
        ('pending', '待执行'),
        ('running', '执行中'),
        ('completed', '已完成'),
        ('failed', '执行失败'),
        ('cancelled', '已取消'),
    ]
    name = models.CharField(max_length=200, verbose_name='任务名称')
    keywords = models.TextField(verbose_name='搜索关键字', help_text='多个关键字用逗号分隔')
    websites = models.ManyToManyField(Website, verbose_name='目标网站')
    status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name='任务状态')
    created_by = models.CharField(max_length=100, verbose_name='创建者', default='system')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
    started_at = models.DateTimeField(null=True, blank=True, verbose_name='开始时间')
    completed_at = models.DateTimeField(null=True, blank=True, verbose_name='完成时间')
    error_message = models.TextField(blank=True, verbose_name='错误信息')
    total_pages = models.IntegerField(default=0, verbose_name='总页数')
    crawled_pages = models.IntegerField(default=0, verbose_name='已爬取页数')

    class Meta:
        verbose_name = '爬取任务'
        verbose_name_plural = '爬取任务'
        ordering = ['-created_at']

    def __str__(self):
        return f"{self.name} - {self.get_status_display()}"

    @property
    def progress_percentage(self):
        """Compute the task progress as a percentage."""
        if self.total_pages == 0:
            return 0
        return round((self.crawled_pages / self.total_pages) * 100, 2)


class CrawledContent(models.Model):
    """A piece of crawled content."""
    task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='contents', verbose_name='所属任务')
    website = models.ForeignKey(Website, on_delete=models.CASCADE, verbose_name='来源网站')
    title = models.CharField(max_length=500, verbose_name='标题')
    content = models.TextField(verbose_name='内容')
    url = models.URLField(verbose_name='原文链接')
    publish_date = models.DateTimeField(null=True, blank=True, verbose_name='发布时间')
    author = models.CharField(max_length=100, blank=True, verbose_name='作者')
    keywords_matched = models.TextField(verbose_name='匹配的关键字', help_text='匹配到的关键字,用逗号分隔')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='爬取时间')
    # Local-storage fields
    local_file = models.FileField(upload_to=crawled_content_file_path, blank=True, null=True, verbose_name='本地文件')
    is_local_saved = models.BooleanField(default=False, verbose_name='是否已本地保存')

    class Meta:
        verbose_name = '爬取内容'
        verbose_name_plural = '爬取内容'
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['task', 'website']),
            models.Index(fields=['created_at']),
            models.Index(fields=['publish_date']),
        ]

    def __str__(self):
        return f"{self.website.name} - {self.title[:50]}"

    def get_preview_content(self, max_length=500):
        """Return a truncated preview of the content."""
        if len(self.content) <= max_length:
            return self.content
        return self.content[:max_length] + '...'


class MediaFile(models.Model):
    """A downloaded media file."""
    MEDIA_TYPE_CHOICES = [
        ('image', '图片'),
        ('video', '视频'),
        ('audio', '音频'),
        ('document', '文档'),
    ]
    content = models.ForeignKey(CrawledContent, on_delete=models.CASCADE, related_name='media_files', verbose_name='所属内容')
    media_type = models.CharField(max_length=20, choices=MEDIA_TYPE_CHOICES, verbose_name='媒体类型')
    original_url = models.URLField(verbose_name='原始URL')
    local_file = models.FileField(upload_to=media_file_path, verbose_name='本地文件')
    file_size = models.BigIntegerField(null=True, blank=True, verbose_name='文件大小(字节)')
    mime_type = models.CharField(max_length=100, blank=True, verbose_name='MIME类型')
    alt_text = models.CharField(max_length=500, blank=True, verbose_name='替代文本')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')

    class Meta:
        verbose_name = '媒体文件'
        verbose_name_plural = '媒体文件'
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['content', 'media_type']),
            models.Index(fields=['created_at']),
        ]

    def __str__(self):
        return f"{self.get_media_type_display()} - {self.original_url}"

    @property
    def file_size_display(self):
        """Human-readable file size."""
        if not self.file_size:
            return "未知"
        size = self.file_size
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size < 1024.0:
                return f"{size:.1f} {unit}"
            size /= 1024.0
        return f"{size:.1f} TB"


class CrawlLog(models.Model):
    """A crawl log entry."""
    LOG_LEVEL_CHOICES = [
        ('info', '信息'),
        ('warning', '警告'),
        ('error', '错误'),
        ('debug', '调试'),
    ]
    task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='logs', verbose_name='所属任务')
    website = models.ForeignKey(Website, on_delete=models.CASCADE, null=True, blank=True, verbose_name='相关网站')
    level = models.CharField(max_length=20, choices=LOG_LEVEL_CHOICES, verbose_name='日志级别')
    message = models.TextField(verbose_name='日志消息')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='记录时间')

    class Meta:
        verbose_name = '爬取日志'
        verbose_name_plural = '爬取日志'
        ordering = ['-created_at']

    def __str__(self):
        return f"[{self.get_level_display()}] {self.message[:100]}"


class SearchKeyword(models.Model):
    """A search keyword."""
    keyword = models.CharField(max_length=100, unique=True, verbose_name='关键字')
    is_active = models.BooleanField(default=True, verbose_name='是否启用')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
    last_used = models.DateTimeField(null=True, blank=True, verbose_name='最后使用时间')

    class Meta:
        verbose_name = '搜索关键字'
        verbose_name_plural = '搜索关键字'
        ordering = ['-last_used', '-created_at']

    def __str__(self):
        return self.keyword
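
A quick way to sanity-check the two computed properties above from `python manage.py shell` (values are illustrative; the instances are never saved):

# illustrative shell session
from crawler.models import CrawlTask, MediaFile

task = CrawlTask(total_pages=8, crawled_pages=3)
print(task.progress_percentage)    # 37.5

media = MediaFile(file_size=3 * 1024 * 1024)
print(media.file_size_display)     # '3.0 MB'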

crawler/serializers.py (new file, 51 lines)

@@ -0,0 +1,51 @@
from rest_framework import serializers
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile


class WebsiteSerializer(serializers.ModelSerializer):
    class Meta:
        model = Website
        fields = '__all__'


class CrawlTaskSerializer(serializers.ModelSerializer):
    websites = WebsiteSerializer(many=True, read_only=True)
    progress_percentage = serializers.ReadOnlyField()

    class Meta:
        model = CrawlTask
        fields = '__all__'


class MediaFileSerializer(serializers.ModelSerializer):
    file_size_display = serializers.ReadOnlyField()

    class Meta:
        model = MediaFile
        fields = '__all__'


class CrawledContentSerializer(serializers.ModelSerializer):
    website_name = serializers.CharField(source='website.name', read_only=True)
    website_region = serializers.CharField(source='website.region', read_only=True)
    task_name = serializers.CharField(source='task.name', read_only=True)
    media_files = MediaFileSerializer(many=True, read_only=True)

    class Meta:
        model = CrawledContent
        fields = '__all__'


class CrawlLogSerializer(serializers.ModelSerializer):
    website_name = serializers.CharField(source='website.name', read_only=True)
    task_name = serializers.CharField(source='task.name', read_only=True)

    class Meta:
        model = CrawlLog
        fields = '__all__'


class SearchKeywordSerializer(serializers.ModelSerializer):
    class Meta:
        model = SearchKeyword
        fields = '__all__'
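
A usage sketch for the nested serializer above, for example from a shell or inside an API view (the queryset is illustrative):

# serialize recent content, with media_files nested via MediaFileSerializer
from crawler.models import CrawledContent
from crawler.serializers import CrawledContentSerializer

queryset = CrawledContent.objects.select_related('website', 'task')[:10]
data = CrawledContentSerializer(queryset, many=True).data  # list of dicts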

crawler/tasks.py (new file, 36 lines)

@@ -0,0 +1,36 @@
from celery import shared_task
from .crawler_engine import run_crawl_task as execute_crawl_task


@shared_task
def crawl_websites_task(task_id):
    """Celery task that crawls the configured websites."""
    return execute_crawl_task(task_id)


@shared_task
def run_crawl_task(task_id):
    """Celery task used by the admin interface."""
    return execute_crawl_task(task_id)


@shared_task
def cleanup_old_tasks():
    """Delete old tasks, keeping only the most recent 30 days."""
    from django.utils import timezone
    from datetime import timedelta
    from .models import CrawlTask, CrawlLog, CrawledContent

    cutoff_date = timezone.now() - timedelta(days=30)
    # Delete tasks older than 30 days along with their related data
    old_tasks = CrawlTask.objects.filter(created_at__lt=cutoff_date)
    count = old_tasks.count()
    for task in old_tasks:
        # Delete the related contents and logs
        CrawledContent.objects.filter(task=task).delete()
        CrawlLog.objects.filter(task=task).delete()
        task.delete()
    return f"清理了 {count} 个旧任务"

crawler/templates/crawler/base.html (new file, 80 lines)

@@ -0,0 +1,80 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block title %}网站爬虫系统{% endblock %}</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
    <style>
        .navbar-brand {
            font-weight: bold;
        }
        .stats-card {
            transition: transform 0.2s;
        }
        .stats-card:hover {
            transform: translateY(-2px);
        }
        .content-preview {
            max-height: 100px;
            overflow: hidden;
            text-overflow: ellipsis;
        }
        .keyword-badge {
            background-color: #e3f2fd;
            color: #1976d2;
            padding: 2px 8px;
            border-radius: 12px;
            font-size: 0.8em;
            margin-right: 5px;
        }
    </style>
</head>
<body>
    <nav class="navbar navbar-expand-lg navbar-dark bg-primary">
        <div class="container">
            <a class="navbar-brand" href="{% url 'dashboard' %}">
                <i class="bi bi-search"></i> 网站爬虫系统
            </a>
            <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav">
                <span class="navbar-toggler-icon"></span>
            </button>
            <div class="collapse navbar-collapse" id="navbarNav">
                <ul class="navbar-nav me-auto">
                    <li class="nav-item">
                        <a class="nav-link" href="{% url 'dashboard' %}">
                            <i class="bi bi-house"></i> 仪表板
                        </a>
                    </li>
                    <li class="nav-item">
                        <a class="nav-link" href="{% url 'search' %}">
                            <i class="bi bi-search"></i> 搜索
                        </a>
                    </li>
                    <li class="nav-item">
                        <a class="nav-link" href="/admin/">
                            <i class="bi bi-gear"></i> 管理后台
                        </a>
                    </li>
                </ul>
            </div>
        </div>
    </nav>
    <main class="container mt-4">
        {% block content %}
        {% endblock %}
    </main>
    <footer class="bg-light mt-5 py-4">
        <div class="container text-center">
            <p class="text-muted mb-0">网站爬虫系统 &copy; 2024</p>
        </div>
    </footer>
    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
    {% block extra_js %}
    {% endblock %}
</body>
</html>

crawler/templates/crawler/dashboard.html (new file, 320 lines)

@@ -0,0 +1,320 @@
{% extends 'crawler/base.html' %}
{% load custom_filters %}

{% block title %}仪表板 - 网站爬虫系统{% endblock %}

{% block content %}
<div class="row">
    <div class="col-12">
        <h1 class="mb-4">
            <i class="bi bi-speedometer2"></i> 系统仪表板
        </h1>
    </div>
</div>

<!-- Stats cards -->
<div class="row mb-4">
    <div class="col-md-3 mb-3">
        <div class="card stats-card bg-primary text-white">
            <div class="card-body">
                <div class="d-flex justify-content-between">
                    <div>
                        <h4 class="card-title">{{ stats.total_websites }}</h4>
                        <p class="card-text">监控网站</p>
                    </div>
                    <div class="align-self-center">
                        <i class="bi bi-globe fs-1"></i>
                    </div>
                </div>
            </div>
        </div>
    </div>
    <div class="col-md-3 mb-3">
        <div class="card stats-card bg-success text-white">
            <div class="card-body">
                <div class="d-flex justify-content-between">
                    <div>
                        <h4 class="card-title">{{ stats.total_tasks }}</h4>
                        <p class="card-text">爬取任务</p>
                    </div>
                    <div class="align-self-center">
                        <i class="bi bi-list-task fs-1"></i>
                    </div>
                </div>
            </div>
        </div>
    </div>
    <div class="col-md-3 mb-3">
        <div class="card stats-card bg-info text-white">
            <div class="card-body">
                <div class="d-flex justify-content-between">
                    <div>
                        <h4 class="card-title">{{ stats.total_contents }}</h4>
                        <p class="card-text">爬取内容</p>
                    </div>
                    <div class="align-self-center">
                        <i class="bi bi-file-text fs-1"></i>
                    </div>
                </div>
            </div>
        </div>
    </div>
    <div class="col-md-3 mb-3">
        <div class="card stats-card bg-warning text-white">
            <div class="card-body">
                <div class="d-flex justify-content-between">
                    <div>
                        <h4 class="card-title">{{ stats.active_tasks }}</h4>
                        <p class="card-text">运行中任务</p>
                    </div>
                    <div class="align-self-center">
                        <i class="bi bi-arrow-clockwise fs-1"></i>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>

<div class="row">
    <!-- Website filter and pagination controls -->
    <div class="col-12 mb-3">
        <div class="card">
            <div class="card-body">
                <div class="row">
                    <div class="col-md-6">
                        <form method="get" class="d-flex">
                            <select name="website" class="form-select me-2" onchange="this.form.submit()">
                                <option value="">所有网站</option>
                                {% for website in stats.websites %}
                                <option value="{{ website.id }}" {% if website.id == stats.selected_website_id %}selected{% endif %}>
                                    {{ website.name }} ({{ website.region }})
                                </option>
                                {% endfor %}
                            </select>
                            <select name="page_size" class="form-select me-2" onchange="this.form.submit()">
                                <option value="10" {% if stats.page_size == 10 %}selected{% endif %}>10条/页</option>
                                <option value="20" {% if stats.page_size == 20 %}selected{% endif %}>20条/页</option>
                                <option value="50" {% if stats.page_size == 50 %}selected{% endif %}>50条/页</option>
                                <option value="100" {% if stats.page_size == 100 %}selected{% endif %}>100条/页</option>
                            </select>
                            <noscript>
                                <button type="submit" class="btn btn-primary">应用</button>
                            </noscript>
                        </form>
                    </div>
                    <div class="col-md-6">
                        <!-- Pagination nav -->
                        {% if stats.page_obj.has_other_pages %}
                        <nav aria-label="页面导航">
                            <ul class="pagination justify-content-end mb-0">
                                {% if stats.page_obj.has_previous %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ stats.page_obj.previous_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="上一页">
                                        <span aria-hidden="true">&laquo;</span>
                                    </a>
                                </li>
                                {% endif %}
                                {% for num in stats.page_obj.paginator.page_range %}
                                {% if stats.page_obj.number == num %}
                                <li class="page-item active">
                                    <span class="page-link">{{ num }}</span>
                                </li>
                                {% elif num > stats.page_obj.number|add:'-3' and num < stats.page_obj.number|add:'3' %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ num }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}">{{ num }}</a>
                                </li>
                                {% endif %}
                                {% endfor %}
                                {% if stats.page_obj.has_next %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ stats.page_obj.next_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="下一页">
                                        <span aria-hidden="true">&raquo;</span>
                                    </a>
                                </li>
                                {% endif %}
                            </ul>
                        </nav>
                        {% endif %}
                    </div>
                </div>
            </div>
        </div>
    </div>

    <!-- Contents grouped by website -->
    <div class="col-md-8">
        {% for website_name, contents in stats.contents_by_website.items %}
        <div class="card mb-4">
            <div class="card-header">
                <h5 class="card-title mb-0">
                    <i class="bi bi-globe"></i> {{ website_name }}
                    <span class="badge bg-secondary">{{ contents|length }}</span>
                </h5>
            </div>
            <div class="card-body">
                <div class="list-group list-group-flush">
                    {% for content in contents %}
                    <div class="list-group-item">
                        <div class="d-flex w-100 justify-content-between">
                            <h6 class="mb-1">
                                {% if content.is_local_saved %}
                                <a href="{% url 'preview_crawled_content' content.id %}" target="_blank" class="text-decoration-none">
                                    {{ content.title|truncatechars:60 }}
                                </a>
                                {% else %}
                                <a href="{{ content.url }}" target="_blank" class="text-decoration-none">
                                    {{ content.title|truncatechars:60 }}
                                </a>
                                {% endif %}
                            </h6>
                            <small class="text-muted">{{ content.created_at|date:"m-d H:i" }}</small>
                        </div>
                        <p class="mb-1 content-preview">{{ content.content|truncatechars:100 }}</p>
                        <div class="d-flex justify-content-between align-items-center">
                            <small class="text-muted">
                                <i class="bi bi-geo-alt"></i> {{ content.website.region }}
                                {% if content.media_files.count > 0 %}
                                | <i class="bi bi-image"></i> {{ content.media_files.count }} 个媒体文件
                                {% endif %}
                            </small>
                            <div>
                                {% for keyword in content.keywords_matched|split:"," %}
                                <span class="keyword-badge">{{ keyword|strip }}</span>
                                {% endfor %}
                            </div>
                        </div>
                    </div>
                    {% endfor %}
                </div>
            </div>
        </div>
        {% empty %}
        <div class="card">
            <div class="card-body text-center">
                <p class="text-muted py-3">暂无爬取内容</p>
            </div>
        </div>
        {% endfor %}

        <!-- Pagination summary (navigation repeated here for convenience) -->
        {% if stats.page_obj.has_other_pages %}
        <div class="card">
            <div class="card-body">
                <div class="d-flex justify-content-between align-items-center">
                    <div>
                        显示第 {{ stats.page_obj.start_index }} 到 {{ stats.page_obj.end_index }} 条,共 {{ stats.page_obj.paginator.count }} 条记录
                    </div>
                    <div>
                        <nav aria-label="页面导航">
                            <ul class="pagination mb-0">
                                {% if stats.page_obj.has_previous %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ stats.page_obj.previous_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="上一页">
                                        <span aria-hidden="true">&laquo;</span>
                                    </a>
                                </li>
                                {% endif %}
                                {% for num in stats.page_obj.paginator.page_range %}
                                {% if stats.page_obj.number == num %}
                                <li class="page-item active">
                                    <span class="page-link">{{ num }}</span>
                                </li>
                                {% elif num > stats.page_obj.number|add:'-3' and num < stats.page_obj.number|add:'3' %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ num }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}">{{ num }}</a>
                                </li>
                                {% endif %}
                                {% endfor %}
                                {% if stats.page_obj.has_next %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ stats.page_obj.next_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="下一页">
                                        <span aria-hidden="true">&raquo;</span>
                                    </a>
                                </li>
                                {% endif %}
                            </ul>
                        </nav>
                    </div>
                </div>
            </div>
        </div>
        {% endif %}
    </div>

    <!-- Recent tasks -->
    <div class="col-md-4">
        <div class="card">
            <div class="card-header">
                <h5 class="card-title mb-0">
                    <i class="bi bi-list-check"></i> 最近的任务
                </h5>
            </div>
            <div class="card-body">
                {% if stats.recent_tasks %}
                <div class="list-group list-group-flush">
                    {% for task in stats.recent_tasks %}
                    <div class="list-group-item">
                        <div class="d-flex w-100 justify-content-between">
                            <h6 class="mb-1">{{ task.name|truncatechars:30 }}</h6>
                            <span class="badge bg-{% if task.status == 'completed' %}success{% elif task.status == 'failed' %}danger{% elif task.status == 'running' %}warning{% else %}secondary{% endif %}">
                                {{ task.get_status_display }}
                            </span>
                        </div>
                        <p class="mb-1">
                            <small class="text-muted">关键字: {{ task.keywords|truncatechars:40 }}</small>
                        </p>
                        <small class="text-muted">{{ task.created_at|date:"m-d H:i" }}</small>
                    </div>
                    {% endfor %}
                </div>
                {% else %}
                <p class="text-muted text-center py-3">暂无任务</p>
                {% endif %}
            </div>
        </div>
    </div>
</div>

<!-- Quick actions -->
<div class="row mt-4">
    <div class="col-12">
        <div class="card">
            <div class="card-header">
                <h5 class="card-title mb-0">
                    <i class="bi bi-lightning"></i> 快速操作
                </h5>
            </div>
            <div class="card-body">
                <div class="row">
                    <div class="col-md-4 mb-3">
                        <a href="{% url 'search' %}" class="btn btn-primary w-100">
                            <i class="bi bi-search"></i> 搜索内容
                        </a>
                    </div>
                    <div class="col-md-4 mb-3">
                        <a href="/admin/crawler/crawltask/add/" class="btn btn-success w-100">
                            <i class="bi bi-plus-circle"></i> 创建任务
                        </a>
                    </div>
                    <div class="col-md-4 mb-3">
                        <a href="/admin/" class="btn btn-outline-secondary w-100">
                            <i class="bi bi-gear"></i> 管理后台
                        </a>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
{% endblock %}

crawler/templates/crawler/search.html (new file, 128 lines)

@@ -0,0 +1,128 @@
{% extends 'crawler/base.html' %}
{% load custom_filters %}

{% block title %}搜索内容 - 网站爬虫系统{% endblock %}

{% block content %}
<div class="row">
    <div class="col-12">
        <h1 class="mb-4">
            <i class="bi bi-search"></i> 内容搜索
        </h1>
    </div>
</div>

<!-- Search form -->
<div class="row mb-4">
    <div class="col-12">
        <div class="card">
            <div class="card-body">
                <form method="get" action="{% url 'search' %}">
                    <div class="input-group input-group-lg">
                        <input type="text"
                               class="form-control"
                               name="q"
                               value="{{ keyword }}"
                               placeholder="输入关键字搜索内容..."
                               required>
                        <button class="btn btn-primary" type="submit">
                            <i class="bi bi-search"></i> 搜索
                        </button>
                    </div>
                </form>
            </div>
        </div>
    </div>
</div>

<!-- Search results -->
{% if keyword %}
<div class="row">
    <div class="col-12">
        <div class="card">
            <div class="card-header">
                <h5 class="card-title mb-0">
                    <i class="bi bi-list-ul"></i> 搜索结果
                    {% if contents %}
                    <span class="badge bg-primary ms-2">{{ contents|length }} 条结果</span>
                    {% endif %}
                </h5>
            </div>
            <div class="card-body">
                {% if contents %}
                <div class="list-group list-group-flush">
                    {% for content in contents %}
                    <div class="list-group-item">
                        <div class="d-flex w-100 justify-content-between">
                            <h5 class="mb-1">
                                {% if content.is_local_saved %}
                                <a href="{% url 'preview_crawled_content' content.id %}" target="_blank" class="text-decoration-none">
                                    {{ content.title }}
                                </a>
                                {% else %}
                                <a href="{{ content.url }}" target="_blank" class="text-decoration-none">
                                    {{ content.title }}
                                </a>
                                {% endif %}
                            </h5>
                            <small class="text-muted">{{ content.created_at|date:"Y-m-d H:i" }}</small>
                        </div>
                        <p class="mb-2 content-preview">{{ content.content|truncatechars:200 }}</p>
                        <div class="d-flex justify-content-between align-items-center">
                            <small class="text-muted">
                                <i class="bi bi-geo-alt"></i> {{ content.website.region }} - {{ content.website.name }}
                                {% if content.author %}
                                | <i class="bi bi-person"></i> {{ content.author }}
                                {% endif %}
                                {% if content.publish_date %}
                                | <i class="bi bi-calendar"></i> {{ content.publish_date|date:"Y-m-d" }}
                                {% endif %}
                                {% if content.media_files.count > 0 %}
                                | <i class="bi bi-image"></i> {{ content.media_files.count }} 个媒体文件
                                {% endif %}
                            </small>
                            <div>
                                {% for keyword in content.keywords_matched|split:"," %}
                                <span class="keyword-badge">{{ keyword|strip }}</span>
                                {% endfor %}
                            </div>
                        </div>
                    </div>
                    {% endfor %}
                </div>
                {% else %}
                <div class="text-center py-5">
                    <i class="bi bi-search fs-1 text-muted"></i>
                    <p class="text-muted mt-3">没有找到包含 "{{ keyword }}" 的内容</p>
                    <p class="text-muted">请尝试其他关键字或检查爬取任务是否正常运行</p>
                </div>
                {% endif %}
            </div>
        </div>
    </div>
</div>
{% else %}
<!-- Search tips -->
<div class="row">
    <div class="col-12">
        <div class="card">
            <div class="card-body text-center py-5">
                <i class="bi bi-search fs-1 text-muted"></i>
                <h4 class="text-muted mt-3">开始搜索</h4>
                <p class="text-muted">在上方输入框中输入关键字,搜索已爬取的内容</p>
                <div class="mt-4">
                    <h6>搜索建议:</h6>
                    <div class="d-flex flex-wrap justify-content-center gap-2">
                        <span class="badge bg-light text-dark">反腐败</span>
                        <span class="badge bg-light text-dark">纪律检查</span>
                        <span class="badge bg-light text-dark">监督</span>
                        <span class="badge bg-light text-dark">廉政</span>
                        <span class="badge bg-light text-dark">违纪</span>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
{% endif %}
{% endblock %}

crawler/templatetags/custom_filters.py (new file, 32 lines)

@@ -0,0 +1,32 @@
from django import template

register = template.Library()


@register.filter
def split(value, separator=','):
    """Split a string by separator."""
    if not value:
        return []
    return value.split(separator)


@register.filter
def strip(value):
    """Strip whitespace from a string."""
    if not value:
        return ''
    return value.strip()


@register.filter
def div(value, divisor):
    """Divide value by divisor."""
    try:
        value = float(value)
        divisor = float(divisor)
        if divisor == 0:
            return 0
        return value / divisor
    except (ValueError, TypeError):
        return 0
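
A quick way to exercise the split and strip filters outside a template file, assuming the crawler app is in INSTALLED_APPS so the tag library is discoverable:

# illustrative check from a Django shell
from django.template import Context, Template

t = Template('{% load custom_filters %}'
             '{% for k in "反腐,  监督"|split:"," %}[{{ k|strip }}]{% endfor %}')
print(t.render(Context()))  # [反腐][监督]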

crawler/tests.py (new file, 3 lines)

@@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

crawler/urls.py (new file, 8 lines)

@@ -0,0 +1,8 @@
from django.urls import path
from . import views

urlpatterns = [
    path('', views.dashboard, name='dashboard'),
    path('search/', views.search_page, name='search'),
    path('crawled-content/<int:content_id>/preview/', views.preview_crawled_content, name='preview_crawled_content'),
]
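
This URLconf still needs to be mounted by the project-level urls.py, which is not part of this commit. A minimal sketch (module layout assumed); serving MEDIA_URL in development matches the hard-coded /media/... paths used by the admin and preview pages:

# project urls.py — a sketch, not the project's confirmed configuration
from django.conf import settings
from django.conf.urls.static import static
from django.contrib import admin
from django.urls import path, include

urlpatterns = [
    path('admin/', admin.site.urls),
    path('', include('crawler.urls')),
] + static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)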

crawler/views.py (new file, 292 lines)

@@ -0,0 +1,292 @@
from django.shortcuts import render, get_object_or_404
from django.http import HttpResponse
from django.db.models import Q, Count
from django.conf import settings
from django.utils import timezone
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword
from rest_framework import viewsets, filters
from rest_framework.decorators import action
from rest_framework.response import Response
from .serializers import (
WebsiteSerializer, CrawlTaskSerializer, CrawledContentSerializer,
CrawlLogSerializer, SearchKeywordSerializer
)
import json
from django.core.paginator import Paginator
from django.db.models.functions import TruncDate
from django.db.models import Count
def dashboard(request):
"""仪表板视图"""
# 统计数据
total_websites = Website.objects.filter(is_active=True).count()
total_tasks = CrawlTask.objects.count()
total_contents = CrawledContent.objects.count()
active_tasks = CrawlTask.objects.filter(status='running').count()
# 获取所有网站
websites = Website.objects.filter(is_active=True).order_by('name')
# 获取当前选中的网站ID
selected_website_id = request.GET.get('website')
# 获取分页参数
page_number = request.GET.get('page', 1)
page_size = request.GET.get('page_size', 20) # 默认每页20篇文章
# 尝试转换page_size为整数
try:
page_size = int(page_size)
# 限制page_size在合理范围内
page_size = max(10, min(100, page_size))
except (ValueError, TypeError):
page_size = 20
# 获取所有爬取的内容,按网站和创建时间排序
all_contents = CrawledContent.objects.select_related('website').order_by('website__name', '-created_at')
# 如果选择了特定网站,则进行过滤
if selected_website_id:
try:
selected_website_id = int(selected_website_id)
all_contents = all_contents.filter(website_id=selected_website_id)
except (ValueError, TypeError):
pass
# 分页处理
paginator = Paginator(all_contents, page_size)
page_obj = paginator.get_page(page_number)
# 按网站分组内容
contents_by_website = {}
for content in page_obj:
website_name = content.website.name
if website_name not in contents_by_website:
contents_by_website[website_name] = []
contents_by_website[website_name].append(content)
# 最近的任务
recent_tasks = CrawlTask.objects.order_by('-created_at')[:5]
# 媒体文件统计
total_media_files = CrawledContent.objects.aggregate(
total_media=Count('media_files')
)['total_media'] or 0
stats = {
'total_websites': total_websites,
'total_tasks': total_tasks,
'total_contents': total_contents,
'active_tasks': active_tasks,
'websites': websites,
'selected_website_id': selected_website_id,
'page_obj': page_obj,
'contents_by_website': contents_by_website,
'page_size': page_size,
'recent_tasks': recent_tasks,
'total_media_files': total_media_files,
}
return render(request, 'crawler/dashboard.html', {'stats': stats})
def search_page(request):
"""搜索页面视图"""
keyword = request.GET.get('q', '').strip()
contents = []
if keyword:
# 记录搜索关键字
SearchKeyword.objects.get_or_create(
keyword=keyword,
defaults={'last_used': timezone.now()}
)
# 搜索内容
contents = CrawledContent.objects.filter(
Q(title__icontains=keyword) |
Q(content__icontains=keyword) |
Q(keywords_matched__icontains=keyword)
).order_by('-created_at')[:50]
return render(request, 'crawler/search.html', {
'keyword': keyword,
'contents': contents
})
def preview_crawled_content(request, content_id):
"""预览爬取的内容"""
content = get_object_or_404(CrawledContent, id=content_id)
    # Attached media files
media_files = content.media_files.all()
    # Build the media-files HTML section
media_section = ""
if media_files:
media_section = """
<div class="media-section">
<h3>媒体文件</h3>
"""
for media_file in media_files:
if media_file.media_type == 'image':
media_section += f"""
<div class="media-item">
<h4>图片: {media_file.alt_text or '无标题'}</h4>
<img src="/media/{media_file.local_file.name}" alt="{media_file.alt_text}" style="max-width: 100%; height: auto;">
<p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
<p><small>文件大小: {media_file.file_size_display}</small></p>
</div>
"""
elif media_file.media_type == 'video':
media_section += f"""
<div class="media-item">
<h4>视频</h4>
<video controls style="max-width: 100%;">
<source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
您的浏览器不支持视频播放。
</video>
<p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
<p><small>文件大小: {media_file.file_size_display}</small></p>
</div>
"""
elif media_file.media_type == 'audio':
media_section += f"""
<div class="media-item">
<h4>音频</h4>
<audio controls>
<source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
您的浏览器不支持音频播放。
</audio>
<p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
<p><small>文件大小: {media_file.file_size_display}</small></p>
</div>
"""
else:
media_section += f"""
<div class="media-item">
<h4>文件: {media_file.get_media_type_display()}</h4>
<p><a href="/media/{media_file.local_file.name}" download>下载文件</a></p>
<p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
<p><small>文件大小: {media_file.file_size_display}</small></p>
</div>
"""
media_section += " </div>"
    # Escape the crawled text, then convert blank lines to paragraphs and single newlines to <br>
    formatted_content = escape(content.content).replace('\n\n', '</p><p>').replace('\n', '<br>')
    # Render the preview page
html_content = f"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
        <title>{escape(content.title)}</title>
<style>
body {{
font-family: Arial, sans-serif;
margin: 40px;
line-height: 1.6;
max-width: 1200px;
margin: 40px auto;
}}
h1 {{ color: #333; margin-bottom: 20px; }}
.meta {{
color: #666;
margin-bottom: 30px;
padding: 20px;
background-color: #f8f9fa;
border-radius: 8px;
border-left: 4px solid #007bff;
}}
.content {{
line-height: 1.8;
font-size: 16px;
margin-bottom: 30px;
}}
.content p {{
margin-bottom: 1em;
}}
.media-section {{
margin-top: 30px;
padding: 20px;
background-color: #f8f9fa;
border-radius: 8px;
}}
.media-item {{
margin-bottom: 20px;
padding: 15px;
border: 1px solid #ddd;
border-radius: 5px;
background-color: white;
}}
.media-item h4 {{
margin-top: 0;
color: #555;
border-bottom: 1px solid #eee;
padding-bottom: 10px;
}}
.back-link {{
margin-bottom: 20px;
}}
.back-link a {{
color: #007bff;
text-decoration: none;
font-weight: bold;
padding: 8px 16px;
background-color: #f8f9fa;
border: 1px solid #ddd;
border-radius: 4px;
}}
.back-link a:hover {{
text-decoration: underline;
background-color: #e9ecef;
}}
.navbar {{
background-color: #007bff;
padding: 15px;
margin-bottom: 30px;
border-radius: 8px;
}}
.navbar a {{
color: white;
text-decoration: none;
margin-right: 20px;
font-weight: bold;
}}
.navbar a:hover {{
text-decoration: underline;
}}
</style>
</head>
<body>
<div class="navbar">
<a href="/">仪表板</a>
<a href="/admin/crawler/crawledcontent/">管理界面</a>
<a href="javascript:history.back()">← 返回</a>
</div>
    <h1>{escape(content.title)}</h1>
<div class="meta">
<p><strong>来源网站:</strong> {content.website.name} ({content.website.region})</p>
<p><strong>原始链接:</strong> <a href="{content.url}" target="_blank">{content.url}</a></p>
<p><strong>发布时间:</strong> {content.publish_date or '未知'}</p>
<p><strong>作者:</strong> {content.author or '未知'}</p>
<p><strong>匹配关键字:</strong> {content.keywords_matched}</p>
<p><strong>爬取时间:</strong> {content.created_at}</p>
<p><strong>媒体文件数量:</strong> {len(media_files)}</p>
</div>
<div class="content">
<p>{formatted_content}</p>
</div>
{media_section}
</body>
</html>
"""
return HttpResponse(html_content, content_type='text/html; charset=utf-8')
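views.py imports WebsiteSerializer and friends from crawler/serializers.py, which this view of the commit does not render; a minimal sketch of what such a module presumably contains (ModelSerializer with all fields is an assumption, not the confirmed implementation):

    # crawler/serializers.py -- illustrative sketch; the real module ships elsewhere in this commit
    from rest_framework import serializers

    from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword


    class WebsiteSerializer(serializers.ModelSerializer):
        class Meta:
            model = Website
            fields = '__all__'


    class CrawledContentSerializer(serializers.ModelSerializer):
        class Meta:
            model = CrawledContent
            fields = '__all__'

    # CrawlTaskSerializer, CrawlLogSerializer, and SearchKeywordSerializer
    # presumably follow the same pattern.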

4
crawler_project/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
# Ensure the Celery app is loaded when Django starts, so shared tasks bind to it
from .celery import app as celery_app
__all__ = ('celery_app',)

16
crawler_project/asgi.py Normal file
View File

@@ -0,0 +1,16 @@
"""
ASGI config for crawler_project project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.2/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')
application = get_asgi_application()

17
crawler_project/celery.py Normal file
View File

@@ -0,0 +1,17 @@
import os
from celery import Celery
# Point Celery at the Django settings module
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')
app = Celery('crawler_project')
# Configure Celery from Django settings; only keys prefixed with CELERY_ are read
app.config_from_object('django.conf:settings', namespace='CELERY')
# Auto-discover tasks.py modules in installed apps
app.autodiscover_tasks()
@app.task(bind=True)
def debug_task(self):
print(f'Request: {self.request!r}')
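The CELERY_-prefixed settings in settings.py below feed this app (CELERY_BROKER_URL becomes broker_url, and so on), and autodiscover_tasks() picks up each app's tasks.py, including the crawl_websites_task that the admin action enqueues with .delay(task.id). That module isn't rendered in this view; a minimal sketch of the shape such a task presumably has (the status values are the ones used elsewhere in the project; everything else is an assumption):

    # crawler/tasks.py -- illustrative sketch only; the real task module ships in this commit
    from celery import shared_task


    @shared_task(bind=True, max_retries=3, default_retry_delay=60)
    def crawl_websites_task(self, task_id):
        # Import lazily so this module is safe to load during app registry setup
        from .models import CrawlTask

        task = CrawlTask.objects.get(id=task_id)
        task.status = 'running'
        task.save(update_fields=['status'])
        try:
            # ... fetch pages for task.websites.all() and store CrawledContent rows ...
            task.status = 'completed'
        except Exception as exc:
            task.status = 'failed'
            task.save(update_fields=['status'])
            raise self.retry(exc=exc)
        task.save(update_fields=['status'])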

181
crawler_project/settings.py Normal file
View File

@@ -0,0 +1,181 @@
"""
Django settings for crawler_project project.
Generated by 'django-admin startproject' using Django 5.2.6.
For more information on this file, see
https://docs.djangoproject.com/en/5.2/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.2/ref/settings/
"""
from pathlib import Path
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.2/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-w5lm159dl-)=z!dysfxf8!n^o26^6)4^!@5(yp*5-_c=!_tcq!'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
ALLOWED_HOSTS = []
# Application definition
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'crawler',
'rest_framework',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'crawler_project.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'crawler_project.wsgi.application'
# Database
# https://docs.djangoproject.com/en/5.2/ref/settings/#databases
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
# Password validation
# https://docs.djangoproject.com/en/5.2/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/5.2/topics/i18n/
LANGUAGE_CODE = 'zh-hans'
TIME_ZONE = 'Asia/Shanghai'
USE_I18N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.2/howto/static-files/
STATIC_URL = 'static/'
STATIC_ROOT = BASE_DIR / 'staticfiles'  # required by the static() helper in crawler_project/urls.py
# Media files (user uploads and crawler downloads)
MEDIA_URL = '/media/'
MEDIA_ROOT = BASE_DIR / 'media'
# Default primary key field type
# https://docs.djangoproject.com/en/5.2/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
# Celery configuration (read via the CELERY_ namespace in celery.py)
CELERY_BROKER_URL = 'redis://localhost:6379/0'
CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = TIME_ZONE
# Crawler configuration
CRAWLER_SETTINGS = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'REQUEST_DELAY': 1,  # delay between requests, in seconds
    'TIMEOUT': 30,  # per-request timeout, in seconds
    'MAX_RETRIES': 3,  # maximum retries per request
}
# Target websites (central and provincial discipline inspection and supervision portals)
TARGET_WEBSITES = [
{'name': '中共中央纪委', 'url': 'https://www.ccdi.gov.cn/', 'region': '中央'},
{'name': '北京纪检监察', 'url': 'https://www.bjsupervision.gov.cn/', 'region': '北京'},
{'name': '天津纪检监察', 'url': 'https://www.tjjw.gov.cn/', 'region': '天津'},
{'name': '河北纪检监察', 'url': 'http://www.hebcdi.gov.cn/', 'region': '河北'},
{'name': '山西纪检监察', 'url': 'http://www.sxdi.gov.cn/', 'region': '山西'},
{'name': '内蒙古纪检监察', 'url': 'https://www.nmgjjjc.gov.cn/', 'region': '内蒙古'},
{'name': '辽宁纪检监察', 'url': 'https://www.lnsjjjc.gov.cn/', 'region': '辽宁'},
{'name': '吉林纪检监察', 'url': 'http://ccdijl.gov.cn/', 'region': '吉林'},
{'name': '黑龙江纪检监察', 'url': 'https://www.hljjjjc.gov.cn/Hljjjjc/', 'region': '黑龙江'},
{'name': '上海纪检监察', 'url': 'https://www.shjjjc.gov.cn/', 'region': '上海'},
{'name': '江苏纪检监察', 'url': 'https://www.jssjw.gov.cn/', 'region': '江苏'},
{'name': '浙江纪检监察', 'url': 'https://www.zjsjw.gov.cn/shouye/', 'region': '浙江'},
{'name': '安徽纪检监察', 'url': 'http://www.ahjjjc.gov.cn/', 'region': '安徽'},
{'name': '福建纪检监察', 'url': 'https://www.fjcdi.gov.cn/cms/html/fjsjwjw/index.html', 'region': '福建'},
{'name': '江西纪检监察', 'url': 'http://www.jxdi.gov.cn/', 'region': '江西'},
{'name': '山东纪检监察', 'url': 'https://www.sdjj.gov.cn/', 'region': '山东'},
{'name': '河南纪检监察', 'url': 'https://www.hnsjw.gov.cn/sitesources/hnsjct/page_pc/index.html', 'region': '河南'},
{'name': '湖北纪检监察', 'url': 'https://www.hbjwjc.gov.cn/', 'region': '湖北'},
{'name': '湖南纪检监察', 'url': 'https://www.sxfj.gov.cn/', 'region': '湖南'},
{'name': '广东纪检监察', 'url': 'https://www.gdjct.gd.gov.cn/', 'region': '广东'},
{'name': '广西纪检监察', 'url': 'https://www.gxjjw.gov.cn/index.shtml', 'region': '广西'},
{'name': '海南纪检监察', 'url': 'https://www.hncdi.gov.cn/web/hnlzw/v2/html/index.jsp', 'region': '海南'},
{'name': '重庆纪检监察', 'url': 'https://jjc.cq.gov.cn/', 'region': '重庆'},
{'name': '四川纪检监察', 'url': 'https://www.scjc.gov.cn/', 'region': '四川'},
{'name': '贵州纪检监察', 'url': 'http://www.gzdis.gov.cn/', 'region': '贵州'},
{'name': '云南纪检监察', 'url': 'http://www.ynjjjc.gov.cn/', 'region': '云南'},
{'name': '西藏纪检监察', 'url': 'http://www.xzjjw.gov.cn/', 'region': '西藏'},
{'name': '陕西纪检监察', 'url': 'https://www.qinfeng.gov.cn/', 'region': '陕西'},
{'name': '甘肃纪检监察', 'url': 'http://www.gsjw.gov.cn/', 'region': '甘肃'},
    {'name': '青海纪检监察', 'url': 'http://www.qhjc.gov.cn/', 'region': '青海'},
    {'name': '宁夏纪检监察', 'url': 'http://www.nxjjjc.gov.cn/', 'region': '宁夏'},
{'name': '新疆纪检监察', 'url': 'https://www.xjjw.gov.cn/', 'region': '新疆'},
{'name': '新疆兵团纪检监察', 'url': 'http://btjw.xjbt.gov.cn/', 'region': '新疆兵团'},
]
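CRAWLER_SETTINGS is a plain dict that crawler code can read through django.conf.settings; a minimal sketch of a request helper honoring these keys (the helper itself is illustrative, not part of this commit):

    # Illustrative only: how crawler code might consume CRAWLER_SETTINGS
    import time

    import requests
    from django.conf import settings


    def fetch(url):
        """Fetch a URL with the configured UA, timeout, retries, and politeness delay."""
        cfg = settings.CRAWLER_SETTINGS
        headers = {'User-Agent': cfg['USER_AGENT']}
        last_error = None
        for attempt in range(cfg['MAX_RETRIES']):
            try:
                response = requests.get(url, headers=headers, timeout=cfg['TIMEOUT'])
                response.raise_for_status()
                return response
            except requests.RequestException as exc:
                last_error = exc
                time.sleep(cfg['REQUEST_DELAY'])
        raise last_error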

29
crawler_project/urls.py Normal file
View File

@@ -0,0 +1,29 @@
"""
URL configuration for crawler_project project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/5.2/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include
from django.conf import settings
from django.conf.urls.static import static
urlpatterns = [
path('admin/', admin.site.urls),
path('', include('crawler.urls')),
]
if settings.DEBUG:
urlpatterns += static(settings.STATIC_URL, document_root=settings.STATIC_ROOT)
urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)

16
crawler_project/wsgi.py Normal file
View File

@@ -0,0 +1,16 @@
"""
WSGI config for crawler_project project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.2/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')
application = get_wsgi_application()

22
manage.py Executable file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
"""Run administrative tasks."""
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')
try:
from django.core.management import execute_from_command_line
except ImportError as exc:
raise ImportError(
"Couldn't import Django. Are you sure it's installed and "
"available on your PYTHONPATH environment variable? Did you "
"forget to activate a virtual environment?"
) from exc
execute_from_command_line(sys.argv)
if __name__ == '__main__':
main()

29
requirements.txt Normal file
View File

@@ -0,0 +1,29 @@
amqp==5.3.1
asgiref==3.9.1
beautifulsoup4==4.13.5
billiard==4.2.2
celery==5.5.3
certifi==2025.8.3
charset-normalizer==3.4.3
click==8.3.0
click-didyoumean==0.3.1
click-plugins==1.1.1.2
click-repl==0.3.0
Django==5.2.6
djangorestframework==3.15.2
idna==3.10
kombu==5.5.4
lxml==6.0.2
packaging==25.0
prompt_toolkit==3.0.52
python-dateutil==2.9.0.post0
redis==6.4.0
requests==2.32.5
six==1.17.0
soupsieve==2.8
sqlparse==0.5.3
typing_extensions==4.15.0
tzdata==2025.2
urllib3==2.5.0
vine==5.1.0
wcwidth==0.2.14

49
start.sh Executable file
View File

@@ -0,0 +1,49 @@
#!/bin/bash
echo "启动网站爬虫系统..."
# Check that we're in the project root
if [ ! -f "manage.py" ]; then
    echo "Error: please run this script from the project root"
    exit 1
fi
# Check the Python environment
if ! command -v python3 &> /dev/null; then
    echo "Error: python3 not found"
    exit 1
fi
# Install dependencies
echo "Installing dependencies..."
python3 -m pip install -r requirements.txt
# Apply database migrations
echo "Running database migrations..."
python3 manage.py makemigrations
python3 manage.py migrate
# Seed the website list
echo "Initializing website data..."
python3 manage.py init_websites
# Create a superuser if one doesn't exist
echo "Checking for superuser..."
python3 manage.py shell -c "
from django.contrib.auth import get_user_model
User = get_user_model()
if not User.objects.filter(username='admin').exists():
    User.objects.create_superuser('admin', 'admin@example.com', 'admin123')
    print('Created superuser: admin/admin123')
else:
    print('Superuser already exists')
"
echo "Starting the Django server..."
echo "URL: http://localhost:8000"
echo "Admin: http://localhost:8000/admin"
echo "Username: admin, password: admin123"
echo ""
echo "Press Ctrl+C to stop the server"
python3 manage.py runserver 0.0.0.0:8000

12
start_celery.sh Executable file
View File

@@ -0,0 +1,12 @@
#!/bin/bash
# Start the Celery worker; capture its PID right away ($! after launching
# Beat would report Beat's PID, not the worker's)
echo "Starting Celery worker..."
celery -A crawler_project worker --loglevel=info --concurrency=4 &
WORKER_PID=$!
# Start Celery Beat (periodic tasks)
echo "Starting Celery Beat..."
celery -A crawler_project beat --loglevel=info &
BEAT_PID=$!
echo "Celery services started"
echo "Worker PID: $WORKER_PID"
echo "Beat PID: $BEAT_PID"