Base setup
2
.gitignore
vendored
@@ -166,7 +166,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/

# Ruff stuff:
.ruff_cache/
0
crawler/__init__.py
Normal file
199
crawler/admin.py
Normal file
@@ -0,0 +1,199 @@
|
||||
from django.contrib import admin
|
||||
from django.utils.html import format_html
|
||||
from django.urls import reverse
|
||||
from django.utils.safestring import mark_safe
|
||||
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
|
||||
from .tasks import crawl_websites_task
|
||||
|
||||
|
||||
@admin.register(Website)
|
||||
class WebsiteAdmin(admin.ModelAdmin):
|
||||
list_display = ['name', 'region', 'url', 'is_active', 'created_at']
|
||||
list_filter = ['region', 'is_active', 'created_at']
|
||||
search_fields = ['name', 'url', 'region']
|
||||
list_editable = ['is_active']
|
||||
ordering = ['region', 'name']
|
||||
|
||||
|
||||
@admin.register(CrawlTask)
|
||||
class CrawlTaskAdmin(admin.ModelAdmin):
|
||||
list_display = ['name', 'status', 'created_by', 'progress_display', 'created_at', 'completed_at']
|
||||
list_filter = ['status', 'created_by', 'created_at']
|
||||
search_fields = ['name', 'keywords']
|
||||
readonly_fields = ['created_at', 'started_at', 'completed_at', 'progress_display']
|
||||
filter_horizontal = ['websites']
|
||||
actions = ['execute_crawl_task']
|
||||
|
||||
def progress_display(self, obj):
|
||||
"""显示任务进度"""
|
||||
if obj.status == 'completed':
|
||||
color = 'green'
|
||||
elif obj.status == 'failed':
|
||||
color = 'red'
|
||||
elif obj.status == 'running':
|
||||
color = 'orange'
|
||||
else:
|
||||
color = 'gray'
|
||||
|
||||
return format_html(
'<span style="color: {};">{}% ({}/{})</span>',
color,
f'{obj.progress_percentage:.1f}',
obj.crawled_pages,
obj.total_pages,
)
|
||||
progress_display.short_description = '进度'
|
||||
|
||||
def execute_crawl_task(self, request, queryset):
|
||||
"""执行选中的爬取任务"""
|
||||
for task in queryset:
|
||||
# 更新任务状态为pending
|
||||
task.status = 'pending'
|
||||
task.save()
|
||||
|
||||
# 异步执行爬取任务
|
||||
crawl_websites_task.delay(task.id)
|
||||
|
||||
self.message_user(request, f"已启动 {queryset.count()} 个爬取任务。")
|
||||
execute_crawl_task.short_description = "执行选中的爬取任务"
|
||||
|
||||
|
||||
@admin.register(CrawledContent)
|
||||
class CrawledContentAdmin(admin.ModelAdmin):
|
||||
list_display = ['title_short', 'website', 'task', 'keywords_matched', 'media_count', 'publish_date', 'is_local_saved', 'created_at']
|
||||
list_filter = ['website', 'task', 'created_at', 'publish_date', 'is_local_saved']
|
||||
search_fields = ['title', 'content', 'keywords_matched']
|
||||
readonly_fields = ['created_at', 'preview_content', 'media_files_display']
|
||||
ordering = ['-created_at']
|
||||
|
||||
def title_short(self, obj):
|
||||
"""显示缩短的标题"""
|
||||
return obj.title[:50] + '...' if len(obj.title) > 50 else obj.title
|
||||
title_short.short_description = '标题'
|
||||
|
||||
def media_count(self, obj):
|
||||
"""显示媒体文件数量"""
|
||||
count = obj.media_files.count()
|
||||
if count > 0:
|
||||
return format_html(
|
||||
'<span style="color: green; font-weight: bold;">{}</span>',
|
||||
count
|
||||
)
|
||||
return "0"
|
||||
media_count.short_description = '媒体文件'
|
||||
|
||||
def preview_content(self, obj):
|
||||
"""预览内容"""
|
||||
if obj.is_local_saved:
|
||||
url = reverse('admin:crawled_content_preview', args=[obj.id])
|
||||
return format_html(
|
||||
'<a href="{}" target="_blank" class="button">预览文章</a>',
|
||||
url
|
||||
)
|
||||
elif obj.content:
|
||||
return format_html(
|
||||
'<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px;">{}</div>',
|
||||
obj.get_preview_content(500)
|
||||
)
|
||||
else:
|
||||
return "无内容"
|
||||
preview_content.short_description = '内容预览'
|
||||
|
||||
def media_files_display(self, obj):
|
||||
"""显示媒体文件列表"""
|
||||
media_files = obj.media_files.all()
|
||||
if not media_files:
|
||||
return "无媒体文件"
|
||||
|
||||
html = "<div style='max-height: 300px; overflow-y: auto;'>"
|
||||
for media_file in media_files:
|
||||
if media_file.media_type == 'image':
|
||||
html += format_html(
|
||||
'<div style="margin-bottom: 10px; border: 1px solid #ddd; padding: 5px;">'
|
||||
'<strong>图片:</strong> {}<br>'
|
||||
'<img src="/media/{}" style="max-width: 150px; max-height: 150px;" /><br>'
|
||||
'<small>大小: {}</small>'
|
||||
'</div>',
|
||||
media_file.alt_text or '无标题',
|
||||
media_file.local_file.name,
|
||||
media_file.file_size_display
|
||||
)
|
||||
elif media_file.media_type == 'video':
|
||||
html += format_html(
|
||||
'<div style="margin-bottom: 10px; border: 1px solid #ddd; padding: 5px;">'
|
||||
'<strong>视频:</strong><br>'
|
||||
'<video controls style="max-width: 200px; max-height: 150px;">'
|
||||
'<source src="/media/{}" type="{}">'
|
||||
'</video><br>'
|
||||
'<small>大小: {}</small>'
|
||||
'</div>',
|
||||
media_file.local_file.name,
|
||||
media_file.mime_type,
|
||||
media_file.file_size_display
|
||||
)
|
||||
else:
|
||||
html += format_html(
|
||||
'<div style="margin-bottom: 10px; border: 1px solid #ddd; padding: 5px;">'
|
||||
'<strong>{}:</strong> <a href="/media/{}" download>下载</a><br>'
|
||||
'<small>大小: {}</small>'
|
||||
'</div>',
|
||||
media_file.get_media_type_display(),
|
||||
media_file.local_file.name,
|
||||
media_file.file_size_display
|
||||
)
|
||||
html += "</div>"
|
||||
return mark_safe(html)
|
||||
media_files_display.short_description = '媒体文件'
|
||||
|
||||
|
||||
@admin.register(CrawlLog)
|
||||
class CrawlLogAdmin(admin.ModelAdmin):
|
||||
list_display = ['level', 'message_short', 'website', 'task', 'created_at']
|
||||
list_filter = ['level', 'website', 'task', 'created_at']
|
||||
search_fields = ['message']
|
||||
readonly_fields = ['created_at']
|
||||
ordering = ['-created_at']
|
||||
|
||||
def message_short(self, obj):
|
||||
"""显示缩短的消息"""
|
||||
return obj.message[:100] + '...' if len(obj.message) > 100 else obj.message
|
||||
message_short.short_description = '消息'
|
||||
|
||||
|
||||
@admin.register(MediaFile)
|
||||
class MediaFileAdmin(admin.ModelAdmin):
|
||||
list_display = ['content', 'media_type', 'file_size_display', 'mime_type', 'created_at']
|
||||
list_filter = ['media_type', 'created_at']
|
||||
search_fields = ['content__title', 'original_url', 'alt_text']
|
||||
readonly_fields = ['created_at', 'file_size_display', 'media_preview']
|
||||
ordering = ['-created_at']
|
||||
|
||||
def media_preview(self, obj):
|
||||
"""媒体文件预览"""
|
||||
if obj.media_type == 'image' and obj.local_file:
|
||||
return format_html(
|
||||
'<img src="/media/{}" style="max-width: 200px; max-height: 200px;" />',
|
||||
obj.local_file.name
|
||||
)
|
||||
elif obj.media_type == 'video' and obj.local_file:
|
||||
return format_html(
|
||||
'<video controls style="max-width: 200px; max-height: 200px;"><source src="/media/{}" type="{}"></video>',
|
||||
obj.local_file.name,
|
||||
obj.mime_type
|
||||
)
|
||||
elif obj.media_type == 'audio' and obj.local_file:
|
||||
return format_html(
|
||||
'<audio controls><source src="/media/{}" type="{}"></audio>',
|
||||
obj.local_file.name,
|
||||
obj.mime_type
|
||||
)
|
||||
else:
|
||||
return "无预览"
|
||||
media_preview.short_description = '预览'
|
||||
|
||||
|
||||
@admin.register(SearchKeyword)
|
||||
class SearchKeywordAdmin(admin.ModelAdmin):
|
||||
list_display = ['keyword', 'is_active', 'created_at', 'last_used']
|
||||
list_filter = ['is_active', 'created_at', 'last_used']
|
||||
search_fields = ['keyword']
|
||||
list_editable = ['is_active']
|
||||
ordering = ['-last_used', '-created_at']
|
||||
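Note: the preview_content column above reverses 'admin:crawled_content_preview', a URL that is not defined anywhere in this commit. A minimal sketch of how such a route could be attached to CrawledContentAdmin via get_urls() follows; the mixin name, view body and parameter name are assumptions, not part of this change.

from django.http import HttpResponse
from django.urls import path
from .models import CrawledContent

class CrawledContentPreviewMixin:
    """Sketch: mixin adding the custom preview route that preview_content() reverses."""

    def get_urls(self):
        urls = super().get_urls()
        custom = [
            path(
                '<int:content_id>/preview/',
                self.admin_site.admin_view(self.preview_view),
                name='crawled_content_preview',  # reversed as 'admin:crawled_content_preview'
            ),
        ]
        return custom + urls

    def preview_view(self, request, content_id):
        # Serve the stored article text for the "预览文章" button.
        obj = CrawledContent.objects.get(pk=content_id)
        return HttpResponse(obj.content, content_type='text/plain; charset=utf-8')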
6
crawler/apps.py
Normal file
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class CrawlerConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'crawler'
578
crawler/crawler_engine.py
Normal file
@@ -0,0 +1,578 @@
|
||||
import requests
|
||||
import time
|
||||
import re
|
||||
import logging
|
||||
import os
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.core.files.base import ContentFile
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
|
||||
|
||||
# 禁用SSL警告
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
# 设置日志记录器
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WebsiteCrawler:
|
||||
"""网站爬虫引擎"""
|
||||
|
||||
def __init__(self, task_id):
|
||||
self.task = CrawlTask.objects.get(id=task_id)
|
||||
self.keywords = [kw.strip() for kw in self.task.keywords.split(',') if kw.strip()]
|
||||
|
||||
# 创建带重试策略的会话
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({
|
||||
'User-Agent': settings.CRAWLER_SETTINGS['USER_AGENT']
|
||||
})
|
||||
|
||||
# 设置重试策略
|
||||
retry_strategy = Retry(
|
||||
total=settings.CRAWLER_SETTINGS.get('MAX_RETRIES', 3),
|
||||
backoff_factor=1,
|
||||
status_forcelist=[429, 500, 502, 503, 504],
|
||||
)
|
||||
adapter = HTTPAdapter(max_retries=retry_strategy)
|
||||
self.session.mount("http://", adapter)
|
||||
self.session.mount("https://", adapter)
|
||||
|
||||
# 设置超时
|
||||
self.timeout = settings.CRAWLER_SETTINGS['TIMEOUT']
|
||||
|
||||
def log(self, level, message, website=None):
|
||||
"""记录日志"""
|
||||
CrawlLog.objects.create(
|
||||
task=self.task,
|
||||
website=website,
|
||||
level=level,
|
||||
message=message
|
||||
)
|
||||
# 同时记录到Python日志系统
|
||||
logger.log(getattr(logging, level.upper()), f"Task {self.task.id}: {message}")
|
||||
|
||||
def update_task_status(self, status, **kwargs):
|
||||
"""更新任务状态"""
|
||||
self.task.status = status
|
||||
if status == 'running' and not self.task.started_at:
|
||||
self.task.started_at = timezone.now()
|
||||
elif status in ['completed', 'failed', 'cancelled']:
|
||||
self.task.completed_at = timezone.now()
|
||||
|
||||
for key, value in kwargs.items():
|
||||
setattr(self.task, key, value)
|
||||
self.task.save()
|
||||
|
||||
def extract_text_content(self, soup):
|
||||
"""提取文本内容,保持段落结构"""
|
||||
# 移除脚本和样式标签
|
||||
for script in soup(["script", "style"]):
|
||||
script.decompose()
|
||||
|
||||
# 处理段落标签,保持段落结构
|
||||
paragraphs = []
|
||||
|
||||
# 查找所有段落相关的标签
|
||||
for element in soup.find_all(['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br']):
|
||||
if element.name in ['p', 'div']:
|
||||
text = element.get_text().strip()
|
||||
if text:
|
||||
paragraphs.append(text)
|
||||
elif element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
||||
text = element.get_text().strip()
|
||||
if text:
|
||||
paragraphs.append(f"\n{text}\n") # 标题前后加换行
|
||||
elif element.name == 'br':
|
||||
paragraphs.append('\n')
|
||||
|
||||
# 如果没有找到段落标签,使用原来的方法
|
||||
if not paragraphs:
|
||||
text = soup.get_text()
|
||||
# 清理文本但保持换行
|
||||
lines = []
|
||||
for line in text.splitlines():
|
||||
line = line.strip()
|
||||
if line:
|
||||
lines.append(line)
|
||||
return '\n\n'.join(lines)
|
||||
|
||||
# 合并段落,用双换行分隔
|
||||
content = '\n\n'.join(paragraphs)
|
||||
|
||||
# 清理多余的空行
|
||||
import re
|
||||
content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
|
||||
|
||||
return content.strip()
|
||||
|
||||
def find_article_links(self, soup, base_url):
|
||||
"""查找文章链接"""
|
||||
links = []
|
||||
|
||||
# 常见的文章链接选择器
|
||||
selectors = [
|
||||
'a[href*="article"]',
|
||||
'a[href*="news"]',
|
||||
'a[href*="content"]',
|
||||
'a[href*="detail"]',
|
||||
'a[href*="view"]',
|
||||
'a[href*="show"]',
|
||||
'.news-list a',
|
||||
'.article-list a',
|
||||
'.content-list a',
|
||||
'h3 a',
|
||||
'h4 a',
|
||||
'.title a',
|
||||
'.list-item a'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
href = element.get('href')
|
||||
if href:
|
||||
full_url = urljoin(base_url, href)
|
||||
title = element.get_text().strip()
|
||||
if title and len(title) > 5: # 过滤掉太短的标题
|
||||
links.append({
|
||||
'url': full_url,
|
||||
'title': title
|
||||
})
|
||||
|
||||
return links
|
||||
|
||||
def check_keyword_match(self, text, title):
|
||||
"""检查关键字匹配"""
|
||||
matched_keywords = []
|
||||
text_lower = text.lower()
|
||||
title_lower = title.lower()
|
||||
|
||||
for keyword in self.keywords:
|
||||
keyword_lower = keyword.lower()
|
||||
if keyword_lower in text_lower or keyword_lower in title_lower:
|
||||
matched_keywords.append(keyword)
|
||||
|
||||
return matched_keywords
|
||||
|
||||
def extract_article_content(self, url, soup):
|
||||
"""提取文章内容"""
|
||||
# 尝试多种内容选择器
|
||||
content_selectors = [
|
||||
'.article-content',
|
||||
'.content',
|
||||
'.article-body',
|
||||
'.news-content',
|
||||
'.main-content',
|
||||
'.post-content',
|
||||
'article',
|
||||
'.detail-content',
|
||||
'#content',
|
||||
'.text'
|
||||
]
|
||||
|
||||
content = ""
|
||||
for selector in content_selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
content = self.extract_text_content(element)
|
||||
if len(content) > 100: # 确保内容足够长
|
||||
break
|
||||
|
||||
# 如果没找到特定内容区域,使用整个页面
|
||||
if not content or len(content) < 100:
|
||||
content = self.extract_text_content(soup)
|
||||
|
||||
return content
|
||||
|
||||
def extract_publish_date(self, soup):
|
||||
"""提取发布时间"""
|
||||
date_selectors = [
|
||||
'.publish-time',
|
||||
'.pub-time',
|
||||
'.date',
|
||||
'.time',
|
||||
'.publish-date',
|
||||
'time[datetime]',
|
||||
'.article-time',
|
||||
'.news-time'
|
||||
]
|
||||
|
||||
for selector in date_selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
date_text = element.get_text().strip()
|
||||
if element.get('datetime'):
|
||||
date_text = element.get('datetime')
|
||||
|
||||
# 尝试解析日期
|
||||
try:
|
||||
from datetime import datetime
|
||||
# 这里可以添加更复杂的日期解析逻辑
|
||||
# 暂时返回当前时间
|
||||
return timezone.now()
|
||||
except:
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
def extract_author(self, soup):
|
||||
"""提取作者信息"""
|
||||
author_selectors = [
|
||||
'.author',
|
||||
'.writer',
|
||||
'.publisher',
|
||||
'.byline',
|
||||
'.article-author',
|
||||
'.news-author'
|
||||
]
|
||||
|
||||
for selector in author_selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text().strip()
|
||||
|
||||
return ""
|
||||
|
||||
def download_media_file(self, media_url, crawled_content, media_type='image', alt_text=''):
|
||||
"""下载媒体文件"""
|
||||
try:
|
||||
# 检查URL是否有效
|
||||
if not media_url or not media_url.startswith(('http://', 'https://')):
|
||||
return None
|
||||
|
||||
# 请求媒体文件
|
||||
response = self.session.get(
|
||||
media_url,
|
||||
timeout=self.timeout,
|
||||
verify=False,
|
||||
stream=False # 改为False以确保获取完整内容
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# 获取文件信息
|
||||
content_type = response.headers.get('content-type', '')
|
||||
content_length = response.headers.get('content-length')
|
||||
file_size = int(content_length) if content_length else len(response.content)
|
||||
|
||||
# 确定文件扩展名
|
||||
file_extension = self.get_file_extension_from_url(media_url, content_type)
|
||||
|
||||
# 生成文件名
|
||||
filename = f"media_{crawled_content.id}_{len(crawled_content.media_files.all())}{file_extension}"
|
||||
|
||||
# 创建媒体文件对象
|
||||
media_file = MediaFile.objects.create(
|
||||
content=crawled_content,
|
||||
media_type=media_type,
|
||||
original_url=media_url,
|
||||
file_size=file_size,
|
||||
mime_type=content_type,
|
||||
alt_text=alt_text
|
||||
)
|
||||
|
||||
# 保存文件
|
||||
media_file.local_file.save(
|
||||
filename,
|
||||
ContentFile(response.content),
|
||||
save=True
|
||||
)
|
||||
|
||||
self.log('info', f'媒体文件已下载: {filename} ({media_type})', crawled_content.website)
|
||||
return media_file
|
||||
|
||||
except Exception as e:
|
||||
self.log('error', f'下载媒体文件失败 {media_url}: {str(e)}', crawled_content.website)
|
||||
return None
|
||||
|
||||
def get_file_extension_from_url(self, url, content_type):
|
||||
"""从URL或内容类型获取文件扩展名"""
|
||||
# 从URL获取扩展名
|
||||
parsed_url = urlparse(url)
|
||||
path = parsed_url.path
|
||||
if '.' in path:
|
||||
return os.path.splitext(path)[1]
|
||||
|
||||
# 从内容类型获取扩展名
|
||||
content_type_map = {
|
||||
'image/jpeg': '.jpg',
|
||||
'image/jpg': '.jpg',
|
||||
'image/png': '.png',
|
||||
'image/gif': '.gif',
|
||||
'image/webp': '.webp',
|
||||
'image/svg+xml': '.svg',
|
||||
'video/mp4': '.mp4',
|
||||
'video/avi': '.avi',
|
||||
'video/mov': '.mov',
|
||||
'video/wmv': '.wmv',
|
||||
'video/flv': '.flv',
|
||||
'video/webm': '.webm',
|
||||
'audio/mp3': '.mp3',
|
||||
'audio/wav': '.wav',
|
||||
'audio/ogg': '.ogg',
|
||||
'application/pdf': '.pdf',
|
||||
'application/msword': '.doc',
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
|
||||
}
|
||||
|
||||
return content_type_map.get(content_type.lower(), '.bin')
|
||||
|
||||
def extract_and_download_media(self, soup, crawled_content, base_url):
|
||||
"""提取并下载页面中的媒体文件"""
|
||||
media_files = []
|
||||
|
||||
# 提取图片
|
||||
images = soup.find_all('img')
|
||||
self.log('info', f'找到 {len(images)} 个图片标签', crawled_content.website)
|
||||
|
||||
for img in images:
|
||||
src = img.get('src')
|
||||
if src:
|
||||
# 处理相对URL
|
||||
if src.startswith('//'):
|
||||
src = 'https:' + src
|
||||
elif src.startswith('/'):
|
||||
src = urljoin(base_url, src)
|
||||
elif not src.startswith(('http://', 'https://')):
|
||||
src = urljoin(base_url, src)
|
||||
|
||||
alt_text = img.get('alt', '')
|
||||
self.log('info', f'尝试下载图片: {src}', crawled_content.website)
|
||||
media_file = self.download_media_file(src, crawled_content, 'image', alt_text)
|
||||
if media_file:
|
||||
media_files.append(media_file)
|
||||
self.log('info', f'成功下载图片: {media_file.local_file.name}', crawled_content.website)
|
||||
|
||||
# 提取视频
|
||||
videos = soup.find_all(['video', 'source'])
|
||||
for video in videos:
|
||||
src = video.get('src')
|
||||
if src:
|
||||
# 处理相对URL
|
||||
if src.startswith('//'):
|
||||
src = 'https:' + src
|
||||
elif src.startswith('/'):
|
||||
src = urljoin(base_url, src)
|
||||
elif not src.startswith(('http://', 'https://')):
|
||||
src = urljoin(base_url, src)
|
||||
|
||||
media_file = self.download_media_file(src, crawled_content, 'video')
|
||||
if media_file:
|
||||
media_files.append(media_file)
|
||||
|
||||
# 提取音频
|
||||
audios = soup.find_all('audio')
|
||||
for audio in audios:
|
||||
src = audio.get('src')
|
||||
if src:
|
||||
# 处理相对URL
|
||||
if src.startswith('//'):
|
||||
src = 'https:' + src
|
||||
elif src.startswith('/'):
|
||||
src = urljoin(base_url, src)
|
||||
elif not src.startswith(('http://', 'https://')):
|
||||
src = urljoin(base_url, src)
|
||||
|
||||
media_file = self.download_media_file(src, crawled_content, 'audio')
|
||||
if media_file:
|
||||
media_files.append(media_file)
|
||||
|
||||
return media_files
|
||||
|
||||
def mark_content_saved(self, crawled_content):
|
||||
"""标记内容已保存(内容已存储在数据库中)"""
|
||||
try:
|
||||
crawled_content.is_local_saved = True
|
||||
crawled_content.save()
|
||||
|
||||
media_count = crawled_content.media_files.count()
|
||||
self.log('info', f'文章内容已保存到数据库 (包含 {media_count} 个媒体文件)', crawled_content.website)
|
||||
return True
|
||||
except Exception as e:
|
||||
self.log('error', f'标记内容保存状态失败: {str(e)}', crawled_content.website)
|
||||
return False
|
||||
|
||||
def crawl_website(self, website):
|
||||
"""爬取单个网站"""
|
||||
self.log('info', f'开始爬取网站: {website.name}', website)
|
||||
|
||||
try:
|
||||
# 请求主页
|
||||
response = self.session.get(
|
||||
website.url,
|
||||
timeout=self.timeout,
|
||||
verify=False # 忽略SSL证书验证
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# 检查内容编码
|
||||
if response.encoding != 'utf-8':
|
||||
# 尝试从响应头获取编码
|
||||
content_type = response.headers.get('content-type', '')
|
||||
if 'charset=' in content_type:
|
||||
charset = content_type.split('charset=')[-1]
|
||||
response.encoding = charset
|
||||
else:
|
||||
response.encoding = 'utf-8'
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# 查找文章链接
|
||||
article_links = self.find_article_links(soup, website.url)
|
||||
self.log('info', f'找到 {len(article_links)} 个文章链接', website)
|
||||
|
||||
crawled_count = 0
|
||||
for link_info in article_links:
|
||||
try:
|
||||
# 请求文章页面
|
||||
article_response = self.session.get(
|
||||
link_info['url'],
|
||||
timeout=self.timeout,
|
||||
verify=False # 忽略SSL证书验证
|
||||
)
|
||||
article_response.raise_for_status()
|
||||
|
||||
# 检查内容编码
|
||||
if article_response.encoding != 'utf-8':
|
||||
# 尝试从响应头获取编码
|
||||
content_type = article_response.headers.get('content-type', '')
|
||||
if 'charset=' in content_type:
|
||||
charset = content_type.split('charset=')[-1]
|
||||
article_response.encoding = charset
|
||||
else:
|
||||
article_response.encoding = 'utf-8'
|
||||
|
||||
article_soup = BeautifulSoup(article_response.content, 'html.parser')
|
||||
|
||||
# 提取内容
|
||||
content = self.extract_article_content(link_info['url'], article_soup)
|
||||
title = link_info['title']
|
||||
|
||||
# 检查关键字匹配
|
||||
matched_keywords = self.check_keyword_match(content, title)
|
||||
|
||||
if matched_keywords:
|
||||
# 提取其他信息
|
||||
publish_date = self.extract_publish_date(article_soup)
|
||||
author = self.extract_author(article_soup)
|
||||
|
||||
# 保存内容
|
||||
crawled_content = CrawledContent.objects.create(
|
||||
task=self.task,
|
||||
website=website,
|
||||
title=title,
|
||||
content=content,
|
||||
url=link_info['url'],
|
||||
publish_date=publish_date,
|
||||
author=author,
|
||||
keywords_matched=','.join(matched_keywords),
|
||||
is_local_saved=False # 初始设置为False,保存到本地后会更新为True
|
||||
)
|
||||
|
||||
# 提取并下载媒体文件
|
||||
media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
|
||||
|
||||
# 标记内容已保存
|
||||
self.mark_content_saved(crawled_content)
|
||||
|
||||
crawled_count += 1
|
||||
self.log('info', f'保存文章: {title[:50]}...', website)
|
||||
|
||||
# 请求间隔
|
||||
time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])
|
||||
|
||||
except requests.exceptions.SSLError as e:
|
||||
self.log('error', f'SSL错误,跳过文章 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
self.log('error', f'连接错误,跳过文章 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
except requests.exceptions.Timeout as e:
|
||||
self.log('error', f'请求超时,跳过文章 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
except requests.exceptions.RequestException as e:
|
||||
self.log('error', f'网络请求错误,跳过文章 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
except UnicodeDecodeError as e:
|
||||
self.log('error', f'字符编码错误,跳过文章 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
except Exception as e:
|
||||
self.log('error', f'处理文章失败 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
|
||||
self.log('info', f'网站爬取完成,共保存 {crawled_count} 篇文章', website)
|
||||
return crawled_count
|
||||
|
||||
except requests.exceptions.SSLError as e:
|
||||
self.log('error', f'爬取网站SSL错误: {str(e)}', website)
|
||||
return 0
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
self.log('error', f'爬取网站连接错误: {str(e)}', website)
|
||||
return 0
|
||||
except requests.exceptions.Timeout as e:
|
||||
self.log('error', f'爬取网站超时: {str(e)}', website)
|
||||
return 0
|
||||
except requests.exceptions.RequestException as e:
|
||||
self.log('error', f'爬取网站网络错误: {str(e)}', website)
|
||||
return 0
|
||||
except Exception as e:
|
||||
self.log('error', f'爬取网站失败: {str(e)}', website)
|
||||
return 0
|
||||
|
||||
def run(self):
|
||||
"""运行爬取任务"""
|
||||
self.log('info', f'开始执行爬取任务: {self.task.name}')
|
||||
self.update_task_status('running')
|
||||
|
||||
total_crawled = 0
|
||||
websites = self.task.websites.filter(is_active=True)
|
||||
self.task.total_pages = websites.count()
|
||||
self.task.save()
|
||||
|
||||
for website in websites:
|
||||
try:
|
||||
crawled_count = self.crawl_website(website)
|
||||
total_crawled += crawled_count
|
||||
self.task.crawled_pages += 1
|
||||
self.task.save()
|
||||
|
||||
except Exception as e:
|
||||
self.log('error', f'爬取网站 {website.name} 时发生错误: {str(e)}', website)
|
||||
continue
|
||||
|
||||
# 更新任务状态
|
||||
if total_crawled > 0:
|
||||
self.update_task_status('completed')
|
||||
self.log('info', f'爬取任务完成,共爬取 {total_crawled} 篇文章')
|
||||
else:
|
||||
self.update_task_status('failed', error_message='没有找到匹配的内容')
|
||||
self.log('error', '爬取任务失败,没有找到匹配的内容')
|
||||
|
||||
|
||||
def run_crawl_task(task_id):
|
||||
"""运行爬取任务(Celery任务)"""
|
||||
try:
|
||||
crawler = WebsiteCrawler(task_id)
|
||||
crawler.run()
|
||||
return f"任务 {task_id} 执行完成"
|
||||
except Exception as e:
|
||||
# 记录异常到日志
|
||||
logger.error(f"执行任务 {task_id} 时发生异常: {str(e)}", exc_info=True)
|
||||
|
||||
task = CrawlTask.objects.get(id=task_id)
|
||||
task.status = 'failed'
|
||||
task.error_message = str(e)
|
||||
task.completed_at = timezone.now()
|
||||
task.save()
|
||||
|
||||
CrawlLog.objects.create(
|
||||
task=task,
|
||||
level='error',
|
||||
message=f'任务执行失败: {str(e)}'
|
||||
)
|
||||
return f"任务 {task_id} 执行失败: {str(e)}"
|
||||
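The engine above reads four keys from settings.CRAWLER_SETTINGS (USER_AGENT, TIMEOUT, REQUEST_DELAY, and the optional MAX_RETRIES), but the settings module itself is not part of this commit. A sketch of the expected dict, with illustrative values only:

# settings.py (sketch — only the key names are taken from crawler_engine.py)
CRAWLER_SETTINGS = {
    'USER_AGENT': 'Mozilla/5.0 (compatible; SiteCrawler/1.0)',  # assumed UA string
    'TIMEOUT': 30,         # per-request timeout in seconds (self.timeout)
    'REQUEST_DELAY': 2,    # sleep between article requests, in seconds
    'MAX_RETRIES': 3,      # total retries for the urllib3 Retry strategy
}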
0
crawler/management/__init__.py
Normal file
0
crawler/management/commands/__init__.py
Normal file
36
crawler/management/commands/init_websites.py
Normal file
@@ -0,0 +1,36 @@
from django.core.management.base import BaseCommand
from django.conf import settings
from crawler.models import Website


class Command(BaseCommand):
    help = '初始化目标网站数据'

    def handle(self, *args, **options):
        self.stdout.write('开始初始化目标网站数据...')

        # 清空现有数据
        Website.objects.all().delete()

        # 从设置中获取网站列表
        websites_data = settings.TARGET_WEBSITES

        created_count = 0
        for website_data in websites_data:
            website, created = Website.objects.get_or_create(
                url=website_data['url'],
                defaults={
                    'name': website_data['name'],
                    'region': website_data['region'],
                    'is_active': True
                }
            )
            if created:
                created_count += 1
                self.stdout.write(f'创建网站: {website.name}')
            else:
                self.stdout.write(f'网站已存在: {website.name}')

        self.stdout.write(
            self.style.SUCCESS(f'成功初始化 {created_count} 个网站')
        )
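init_websites reads settings.TARGET_WEBSITES and expects each entry to carry 'name', 'url' and 'region'; that list is defined outside this commit, so the entries below are placeholders that only illustrate the shape:

# settings.py (sketch)
TARGET_WEBSITES = [
    {'name': 'Example News', 'url': 'https://news.example.com/', 'region': 'Example Region'},
    {'name': 'Example Portal', 'url': 'https://portal.example.com/', 'region': 'Another Region'},
]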
69
crawler/management/commands/run_crawler.py
Normal file
@@ -0,0 +1,69 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from crawler.models import CrawlTask, Website
|
||||
from crawler.tasks import crawl_websites_task
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = '运行爬虫任务'
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument(
|
||||
'--keywords',
|
||||
type=str,
|
||||
required=True,
|
||||
help='搜索关键字,多个关键字用逗号分隔'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--websites',
|
||||
type=str,
|
||||
help='网站ID列表,用逗号分隔。不指定则爬取所有网站'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--name',
|
||||
type=str,
|
||||
help='任务名称'
|
||||
)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
keywords = options['keywords']
|
||||
website_ids = options.get('websites')
|
||||
task_name = options.get('name') or f'关键字搜索: {keywords}'
|
||||
|
||||
# 获取目标网站
|
||||
if website_ids:
|
||||
website_id_list = [int(id.strip()) for id in website_ids.split(',')]
|
||||
websites = Website.objects.filter(id__in=website_id_list, is_active=True)
|
||||
else:
|
||||
websites = Website.objects.filter(is_active=True)
|
||||
|
||||
if not websites.exists():
|
||||
self.stdout.write(
|
||||
self.style.ERROR('没有找到可用的网站')
|
||||
)
|
||||
return
|
||||
|
||||
# 创建任务
|
||||
task = CrawlTask.objects.create(
|
||||
name=task_name,
|
||||
keywords=keywords,
|
||||
created_by='management_command'
|
||||
)
|
||||
task.websites.set(websites)
|
||||
|
||||
self.stdout.write(f'创建任务: {task.name}')
|
||||
self.stdout.write(f'目标网站: {websites.count()} 个')
|
||||
self.stdout.write(f'搜索关键字: {keywords}')
|
||||
|
||||
# 启动任务(同步模式,如果没有Redis则直接运行)
|
||||
try:
|
||||
crawl_websites_task.delay(task.id)
|
||||
self.stdout.write('任务已提交到队列')
|
||||
except Exception as e:
|
||||
self.stdout.write(f'队列不可用,直接运行任务: {e}')
|
||||
from crawler.crawler_engine import WebsiteCrawler
|
||||
crawler = WebsiteCrawler(task.id)
|
||||
crawler.run()
|
||||
|
||||
self.stdout.write(
|
||||
self.style.SUCCESS(f'任务已启动,任务ID: {task.id}')
|
||||
)
|
||||
106
crawler/migrations/0001_initial.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# Generated by Django 5.2.6 on 2025-09-22 16:27
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='SearchKeyword',
|
||||
fields=[
|
||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('keyword', models.CharField(max_length=100, unique=True, verbose_name='关键字')),
|
||||
('is_active', models.BooleanField(default=True, verbose_name='是否启用')),
|
||||
('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
|
||||
('last_used', models.DateTimeField(blank=True, null=True, verbose_name='最后使用时间')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': '搜索关键字',
|
||||
'verbose_name_plural': '搜索关键字',
|
||||
'ordering': ['-last_used', '-created_at'],
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='Website',
|
||||
fields=[
|
||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('name', models.CharField(max_length=100, verbose_name='网站名称')),
|
||||
('url', models.URLField(verbose_name='网站地址')),
|
||||
('region', models.CharField(max_length=50, verbose_name='所属地区')),
|
||||
('is_active', models.BooleanField(default=True, verbose_name='是否启用')),
|
||||
('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
|
||||
('updated_at', models.DateTimeField(auto_now=True, verbose_name='更新时间')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': '目标网站',
|
||||
'verbose_name_plural': '目标网站',
|
||||
'ordering': ['region', 'name'],
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='CrawlTask',
|
||||
fields=[
|
||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('name', models.CharField(max_length=200, verbose_name='任务名称')),
|
||||
('keywords', models.TextField(help_text='多个关键字用逗号分隔', verbose_name='搜索关键字')),
|
||||
('status', models.CharField(choices=[('pending', '待执行'), ('running', '执行中'), ('completed', '已完成'), ('failed', '执行失败'), ('cancelled', '已取消')], default='pending', max_length=20, verbose_name='任务状态')),
|
||||
('created_by', models.CharField(default='system', max_length=100, verbose_name='创建者')),
|
||||
('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
|
||||
('started_at', models.DateTimeField(blank=True, null=True, verbose_name='开始时间')),
|
||||
('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='完成时间')),
|
||||
('error_message', models.TextField(blank=True, verbose_name='错误信息')),
|
||||
('total_pages', models.IntegerField(default=0, verbose_name='总页数')),
|
||||
('crawled_pages', models.IntegerField(default=0, verbose_name='已爬取页数')),
|
||||
('websites', models.ManyToManyField(to='crawler.website', verbose_name='目标网站')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': '爬取任务',
|
||||
'verbose_name_plural': '爬取任务',
|
||||
'ordering': ['-created_at'],
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='CrawlLog',
|
||||
fields=[
|
||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('level', models.CharField(choices=[('info', '信息'), ('warning', '警告'), ('error', '错误'), ('debug', '调试')], max_length=20, verbose_name='日志级别')),
|
||||
('message', models.TextField(verbose_name='日志消息')),
|
||||
('created_at', models.DateTimeField(auto_now_add=True, verbose_name='记录时间')),
|
||||
('task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='logs', to='crawler.crawltask', verbose_name='所属任务')),
|
||||
('website', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='crawler.website', verbose_name='相关网站')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': '爬取日志',
|
||||
'verbose_name_plural': '爬取日志',
|
||||
'ordering': ['-created_at'],
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='CrawledContent',
|
||||
fields=[
|
||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('title', models.CharField(max_length=500, verbose_name='标题')),
|
||||
('content', models.TextField(verbose_name='内容')),
|
||||
('url', models.URLField(verbose_name='原文链接')),
|
||||
('publish_date', models.DateTimeField(blank=True, null=True, verbose_name='发布时间')),
|
||||
('author', models.CharField(blank=True, max_length=100, verbose_name='作者')),
|
||||
('keywords_matched', models.TextField(help_text='匹配到的关键字,用逗号分隔', verbose_name='匹配的关键字')),
|
||||
('created_at', models.DateTimeField(auto_now_add=True, verbose_name='爬取时间')),
|
||||
('task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='contents', to='crawler.crawltask', verbose_name='所属任务')),
|
||||
('website', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawler.website', verbose_name='来源网站')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': '爬取内容',
|
||||
'verbose_name_plural': '爬取内容',
|
||||
'ordering': ['-created_at'],
|
||||
'indexes': [models.Index(fields=['task', 'website'], name='crawler_cra_task_id_6244e7_idx'), models.Index(fields=['created_at'], name='crawler_cra_created_a116d2_idx'), models.Index(fields=['publish_date'], name='crawler_cra_publish_5b8ccc_idx')],
|
||||
},
|
||||
),
|
||||
]
24
crawler/migrations/0002_crawledcontent_is_local_saved_and_more.py
Normal file
@@ -0,0 +1,24 @@
|
||||
# Generated by Django 5.2.6 on 2025-09-23 00:38
|
||||
|
||||
import crawler.models
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('crawler', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='crawledcontent',
|
||||
name='is_local_saved',
|
||||
field=models.BooleanField(default=False, verbose_name='是否已本地保存'),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='crawledcontent',
|
||||
name='local_file',
|
||||
field=models.FileField(blank=True, null=True, upload_to=crawler.models.crawled_content_file_path, verbose_name='本地文件'),
|
||||
),
|
||||
]
|
||||
35
crawler/migrations/0003_mediafile.py
Normal file
@@ -0,0 +1,35 @@
|
||||
# Generated by Django 5.2.6 on 2025-09-23 01:05
|
||||
|
||||
import crawler.models
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('crawler', '0002_crawledcontent_is_local_saved_and_more'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='MediaFile',
|
||||
fields=[
|
||||
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('media_type', models.CharField(choices=[('image', '图片'), ('video', '视频'), ('audio', '音频'), ('document', '文档')], max_length=20, verbose_name='媒体类型')),
|
||||
('original_url', models.URLField(verbose_name='原始URL')),
|
||||
('local_file', models.FileField(upload_to=crawler.models.media_file_path, verbose_name='本地文件')),
|
||||
('file_size', models.BigIntegerField(blank=True, null=True, verbose_name='文件大小(字节)')),
|
||||
('mime_type', models.CharField(blank=True, max_length=100, verbose_name='MIME类型')),
|
||||
('alt_text', models.CharField(blank=True, max_length=500, verbose_name='替代文本')),
|
||||
('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
|
||||
('content', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='media_files', to='crawler.crawledcontent', verbose_name='所属内容')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': '媒体文件',
|
||||
'verbose_name_plural': '媒体文件',
|
||||
'ordering': ['-created_at'],
|
||||
'indexes': [models.Index(fields=['content', 'media_type'], name='crawler_med_content_3a9468_idx'), models.Index(fields=['created_at'], name='crawler_med_created_13ff00_idx')],
|
||||
},
|
||||
),
|
||||
]
|
||||
0
crawler/migrations/__init__.py
Normal file
195
crawler/models.py
Normal file
@@ -0,0 +1,195 @@
|
||||
from django.db import models
|
||||
from django.utils import timezone
|
||||
import os
|
||||
|
||||
|
||||
def crawled_content_file_path(instance, filename):
|
||||
"""生成爬取内容文件的存储路径"""
|
||||
# 使用任务ID和时间戳创建唯一文件名
|
||||
timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
|
||||
name, ext = os.path.splitext(filename)
|
||||
return f'crawled_content/{instance.task.id}/{timestamp}_{instance.id}{ext}'
|
||||
|
||||
|
||||
def media_file_path(instance, filename):
|
||||
"""生成媒体文件的存储路径"""
|
||||
# 使用任务ID和内容ID创建媒体文件路径
|
||||
timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
|
||||
name, ext = os.path.splitext(filename)
|
||||
return f'{instance.content.task.id}/{instance.content.id}/{timestamp}_{name}{ext}'
|
||||
|
||||
|
||||
class Website(models.Model):
|
||||
"""目标网站模型"""
|
||||
name = models.CharField(max_length=100, verbose_name='网站名称')
|
||||
url = models.URLField(verbose_name='网站地址')
|
||||
region = models.CharField(max_length=50, verbose_name='所属地区')
|
||||
is_active = models.BooleanField(default=True, verbose_name='是否启用')
|
||||
created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
|
||||
updated_at = models.DateTimeField(auto_now=True, verbose_name='更新时间')
|
||||
|
||||
class Meta:
|
||||
verbose_name = '目标网站'
|
||||
verbose_name_plural = '目标网站'
|
||||
ordering = ['region', 'name']
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.region} - {self.name}"
|
||||
|
||||
|
||||
class CrawlTask(models.Model):
|
||||
"""爬取任务模型"""
|
||||
TASK_STATUS_CHOICES = [
|
||||
('pending', '待执行'),
|
||||
('running', '执行中'),
|
||||
('completed', '已完成'),
|
||||
('failed', '执行失败'),
|
||||
('cancelled', '已取消'),
|
||||
]
|
||||
|
||||
name = models.CharField(max_length=200, verbose_name='任务名称')
|
||||
keywords = models.TextField(verbose_name='搜索关键字', help_text='多个关键字用逗号分隔')
|
||||
websites = models.ManyToManyField(Website, verbose_name='目标网站')
|
||||
status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name='任务状态')
|
||||
created_by = models.CharField(max_length=100, verbose_name='创建者', default='system')
|
||||
created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
|
||||
started_at = models.DateTimeField(null=True, blank=True, verbose_name='开始时间')
|
||||
completed_at = models.DateTimeField(null=True, blank=True, verbose_name='完成时间')
|
||||
error_message = models.TextField(blank=True, verbose_name='错误信息')
|
||||
total_pages = models.IntegerField(default=0, verbose_name='总页数')
|
||||
crawled_pages = models.IntegerField(default=0, verbose_name='已爬取页数')
|
||||
|
||||
class Meta:
|
||||
verbose_name = '爬取任务'
|
||||
verbose_name_plural = '爬取任务'
|
||||
ordering = ['-created_at']
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name} - {self.get_status_display()}"
|
||||
|
||||
@property
|
||||
def progress_percentage(self):
|
||||
"""计算任务进度百分比"""
|
||||
if self.total_pages == 0:
|
||||
return 0
|
||||
return round((self.crawled_pages / self.total_pages) * 100, 2)
|
||||
|
||||
|
||||
class CrawledContent(models.Model):
|
||||
"""爬取内容模型"""
|
||||
task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='contents', verbose_name='所属任务')
|
||||
website = models.ForeignKey(Website, on_delete=models.CASCADE, verbose_name='来源网站')
|
||||
title = models.CharField(max_length=500, verbose_name='标题')
|
||||
content = models.TextField(verbose_name='内容')
|
||||
url = models.URLField(verbose_name='原文链接')
|
||||
publish_date = models.DateTimeField(null=True, blank=True, verbose_name='发布时间')
|
||||
author = models.CharField(max_length=100, blank=True, verbose_name='作者')
|
||||
keywords_matched = models.TextField(verbose_name='匹配的关键字', help_text='匹配到的关键字,用逗号分隔')
|
||||
created_at = models.DateTimeField(auto_now_add=True, verbose_name='爬取时间')
|
||||
|
||||
# 添加本地存储字段
|
||||
local_file = models.FileField(upload_to=crawled_content_file_path, blank=True, null=True, verbose_name='本地文件')
|
||||
is_local_saved = models.BooleanField(default=False, verbose_name='是否已本地保存')
|
||||
|
||||
class Meta:
|
||||
verbose_name = '爬取内容'
|
||||
verbose_name_plural = '爬取内容'
|
||||
ordering = ['-created_at']
|
||||
indexes = [
|
||||
models.Index(fields=['task', 'website']),
|
||||
models.Index(fields=['created_at']),
|
||||
models.Index(fields=['publish_date']),
|
||||
]
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.website.name} - {self.title[:50]}"
|
||||
|
||||
def get_preview_content(self, max_length=500):
|
||||
"""获取预览内容"""
|
||||
if len(self.content) <= max_length:
|
||||
return self.content
|
||||
return self.content[:max_length] + '...'
|
||||
|
||||
|
||||
class MediaFile(models.Model):
|
||||
"""媒体文件模型"""
|
||||
MEDIA_TYPE_CHOICES = [
|
||||
('image', '图片'),
|
||||
('video', '视频'),
|
||||
('audio', '音频'),
|
||||
('document', '文档'),
|
||||
]
|
||||
|
||||
content = models.ForeignKey(CrawledContent, on_delete=models.CASCADE, related_name='media_files', verbose_name='所属内容')
|
||||
media_type = models.CharField(max_length=20, choices=MEDIA_TYPE_CHOICES, verbose_name='媒体类型')
|
||||
original_url = models.URLField(verbose_name='原始URL')
|
||||
local_file = models.FileField(upload_to=media_file_path, verbose_name='本地文件')
|
||||
file_size = models.BigIntegerField(null=True, blank=True, verbose_name='文件大小(字节)')
|
||||
mime_type = models.CharField(max_length=100, blank=True, verbose_name='MIME类型')
|
||||
alt_text = models.CharField(max_length=500, blank=True, verbose_name='替代文本')
|
||||
created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
|
||||
|
||||
class Meta:
|
||||
verbose_name = '媒体文件'
|
||||
verbose_name_plural = '媒体文件'
|
||||
ordering = ['-created_at']
|
||||
indexes = [
|
||||
models.Index(fields=['content', 'media_type']),
|
||||
models.Index(fields=['created_at']),
|
||||
]
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.get_media_type_display()} - {self.original_url}"
|
||||
|
||||
@property
|
||||
def file_size_display(self):
|
||||
"""显示文件大小"""
|
||||
if not self.file_size:
|
||||
return "未知"
|
||||
|
||||
size = self.file_size
|
||||
for unit in ['B', 'KB', 'MB', 'GB']:
|
||||
if size < 1024.0:
|
||||
return f"{size:.1f} {unit}"
|
||||
size /= 1024.0
|
||||
return f"{size:.1f} TB"
|
||||
|
||||
|
||||
class CrawlLog(models.Model):
|
||||
"""爬取日志模型"""
|
||||
LOG_LEVEL_CHOICES = [
|
||||
('info', '信息'),
|
||||
('warning', '警告'),
|
||||
('error', '错误'),
|
||||
('debug', '调试'),
|
||||
]
|
||||
|
||||
task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='logs', verbose_name='所属任务')
|
||||
website = models.ForeignKey(Website, on_delete=models.CASCADE, null=True, blank=True, verbose_name='相关网站')
|
||||
level = models.CharField(max_length=20, choices=LOG_LEVEL_CHOICES, verbose_name='日志级别')
|
||||
message = models.TextField(verbose_name='日志消息')
|
||||
created_at = models.DateTimeField(auto_now_add=True, verbose_name='记录时间')
|
||||
|
||||
class Meta:
|
||||
verbose_name = '爬取日志'
|
||||
verbose_name_plural = '爬取日志'
|
||||
ordering = ['-created_at']
|
||||
|
||||
def __str__(self):
|
||||
return f"[{self.get_level_display()}] {self.message[:100]}"
|
||||
|
||||
|
||||
class SearchKeyword(models.Model):
|
||||
"""搜索关键字模型"""
|
||||
keyword = models.CharField(max_length=100, unique=True, verbose_name='关键字')
|
||||
is_active = models.BooleanField(default=True, verbose_name='是否启用')
|
||||
created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
|
||||
last_used = models.DateTimeField(null=True, blank=True, verbose_name='最后使用时间')
|
||||
|
||||
class Meta:
|
||||
verbose_name = '搜索关键字'
|
||||
verbose_name_plural = '搜索关键字'
|
||||
ordering = ['-last_used', '-created_at']
|
||||
|
||||
def __str__(self):
|
||||
return self.keyword
|
||||
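A quick illustration of the two computed properties defined above; the numbers are made up and the snippet assumes a configured Django environment (e.g. python manage.py shell):

from crawler.models import CrawlTask, MediaFile

task = CrawlTask(total_pages=8, crawled_pages=3)
print(task.progress_percentage)   # 37.5  -> round(3 / 8 * 100, 2)

mf = MediaFile(file_size=2_621_440)
print(mf.file_size_display)       # '2.5 MB' -> 2_621_440 / 1024 / 1024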
51
crawler/serializers.py
Normal file
@@ -0,0 +1,51 @@
from rest_framework import serializers
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile


class WebsiteSerializer(serializers.ModelSerializer):
    class Meta:
        model = Website
        fields = '__all__'


class CrawlTaskSerializer(serializers.ModelSerializer):
    websites = WebsiteSerializer(many=True, read_only=True)
    progress_percentage = serializers.ReadOnlyField()

    class Meta:
        model = CrawlTask
        fields = '__all__'


class MediaFileSerializer(serializers.ModelSerializer):
    file_size_display = serializers.ReadOnlyField()

    class Meta:
        model = MediaFile
        fields = '__all__'


class CrawledContentSerializer(serializers.ModelSerializer):
    website_name = serializers.CharField(source='website.name', read_only=True)
    website_region = serializers.CharField(source='website.region', read_only=True)
    task_name = serializers.CharField(source='task.name', read_only=True)
    media_files = MediaFileSerializer(many=True, read_only=True)

    class Meta:
        model = CrawledContent
        fields = '__all__'


class CrawlLogSerializer(serializers.ModelSerializer):
    website_name = serializers.CharField(source='website.name', read_only=True)
    task_name = serializers.CharField(source='task.name', read_only=True)

    class Meta:
        model = CrawlLog
        fields = '__all__'


class SearchKeywordSerializer(serializers.ModelSerializer):
    class Meta:
        model = SearchKeyword
        fields = '__all__'
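No views or URL configuration ship with these serializers in this commit; the following read-only wiring is a hypothetical sketch of how they could be exposed (all names below are illustrative):

from rest_framework import viewsets
from rest_framework.routers import DefaultRouter
from .models import CrawledContent
from .serializers import CrawledContentSerializer

class CrawledContentViewSet(viewsets.ReadOnlyModelViewSet):
    queryset = CrawledContent.objects.select_related('website', 'task').prefetch_related('media_files')
    serializer_class = CrawledContentSerializer

router = DefaultRouter()
router.register(r'contents', CrawledContentViewSet)
# then include router.urls under an /api/ prefix in the project urls.py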
36
crawler/tasks.py
Normal file
@@ -0,0 +1,36 @@
from celery import shared_task
from .crawler_engine import run_crawl_task as execute_crawl_task


@shared_task
def crawl_websites_task(task_id):
    """爬取网站的Celery任务"""
    return execute_crawl_task(task_id)


@shared_task
def run_crawl_task(task_id):
    """执行爬取任务的Celery任务(为管理界面提供)"""
    return execute_crawl_task(task_id)


@shared_task
def cleanup_old_tasks():
    """清理旧任务(保留最近30天的任务)"""
    from django.utils import timezone
    from datetime import timedelta
    from .models import CrawlTask, CrawlLog, CrawledContent

    cutoff_date = timezone.now() - timedelta(days=30)

    # 删除30天前的任务及其相关数据
    old_tasks = CrawlTask.objects.filter(created_at__lt=cutoff_date)
    count = old_tasks.count()

    for task in old_tasks:
        # 删除相关的内容和日志
        CrawledContent.objects.filter(task=task).delete()
        CrawlLog.objects.filter(task=task).delete()
        task.delete()

    return f"清理了 {count} 个旧任务"
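cleanup_old_tasks is written as a periodic housekeeping job, but no schedule is configured in this commit; a hedged sketch of a Celery beat entry that would run it nightly (the schedule and its location are assumptions):

# settings.py or the Celery app module (sketch)
from celery.schedules import crontab

CELERY_BEAT_SCHEDULE = {
    'cleanup-old-crawl-tasks': {
        'task': 'crawler.tasks.cleanup_old_tasks',
        'schedule': crontab(hour=3, minute=0),  # every day at 03:00
    },
}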
80
crawler/templates/crawler/base.html
Normal file
@@ -0,0 +1,80 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="zh-CN">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>{% block title %}网站爬虫系统{% endblock %}</title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
|
||||
<link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
|
||||
<style>
|
||||
.navbar-brand {
|
||||
font-weight: bold;
|
||||
}
|
||||
.stats-card {
|
||||
transition: transform 0.2s;
|
||||
}
|
||||
.stats-card:hover {
|
||||
transform: translateY(-2px);
|
||||
}
|
||||
.content-preview {
|
||||
max-height: 100px;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
}
|
||||
.keyword-badge {
|
||||
background-color: #e3f2fd;
|
||||
color: #1976d2;
|
||||
padding: 2px 8px;
|
||||
border-radius: 12px;
|
||||
font-size: 0.8em;
|
||||
margin-right: 5px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<nav class="navbar navbar-expand-lg navbar-dark bg-primary">
|
||||
<div class="container">
|
||||
<a class="navbar-brand" href="{% url 'dashboard' %}">
|
||||
<i class="bi bi-search"></i> 网站爬虫系统
|
||||
</a>
|
||||
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav">
|
||||
<span class="navbar-toggler-icon"></span>
|
||||
</button>
|
||||
<div class="collapse navbar-collapse" id="navbarNav">
|
||||
<ul class="navbar-nav me-auto">
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="{% url 'dashboard' %}">
|
||||
<i class="bi bi-house"></i> 仪表板
|
||||
</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="{% url 'search' %}">
|
||||
<i class="bi bi-search"></i> 搜索
|
||||
</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="/admin/">
|
||||
<i class="bi bi-gear"></i> 管理后台
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
</nav>
|
||||
|
||||
<main class="container mt-4">
|
||||
{% block content %}
|
||||
{% endblock %}
|
||||
</main>
|
||||
|
||||
<footer class="bg-light mt-5 py-4">
|
||||
<div class="container text-center">
|
||||
<p class="text-muted mb-0">网站爬虫系统 © 2024</p>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
|
||||
{% block extra_js %}
|
||||
{% endblock %}
|
||||
</body>
|
||||
</html>
|
||||
320
crawler/templates/crawler/dashboard.html
Normal file
@@ -0,0 +1,320 @@
|
||||
{% extends 'crawler/base.html' %}
|
||||
{% load custom_filters %}
|
||||
|
||||
{% block title %}仪表板 - 网站爬虫系统{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="row">
|
||||
<div class="col-12">
|
||||
<h1 class="mb-4">
|
||||
<i class="bi bi-speedometer2"></i> 系统仪表板
|
||||
</h1>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 统计卡片 -->
|
||||
<div class="row mb-4">
|
||||
<div class="col-md-3 mb-3">
|
||||
<div class="card stats-card bg-primary text-white">
|
||||
<div class="card-body">
|
||||
<div class="d-flex justify-content-between">
|
||||
<div>
|
||||
<h4 class="card-title">{{ stats.total_websites }}</h4>
|
||||
<p class="card-text">监控网站</p>
|
||||
</div>
|
||||
<div class="align-self-center">
|
||||
<i class="bi bi-globe fs-1"></i>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-md-3 mb-3">
|
||||
<div class="card stats-card bg-success text-white">
|
||||
<div class="card-body">
|
||||
<div class="d-flex justify-content-between">
|
||||
<div>
|
||||
<h4 class="card-title">{{ stats.total_tasks }}</h4>
|
||||
<p class="card-text">爬取任务</p>
|
||||
</div>
|
||||
<div class="align-self-center">
|
||||
<i class="bi bi-list-task fs-1"></i>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-md-3 mb-3">
|
||||
<div class="card stats-card bg-info text-white">
|
||||
<div class="card-body">
|
||||
<div class="d-flex justify-content-between">
|
||||
<div>
|
||||
<h4 class="card-title">{{ stats.total_contents }}</h4>
|
||||
<p class="card-text">爬取内容</p>
|
||||
</div>
|
||||
<div class="align-self-center">
|
||||
<i class="bi bi-file-text fs-1"></i>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="col-md-3 mb-3">
|
||||
<div class="card stats-card bg-warning text-white">
|
||||
<div class="card-body">
|
||||
<div class="d-flex justify-content-between">
|
||||
<div>
|
||||
<h4 class="card-title">{{ stats.active_tasks }}</h4>
|
||||
<p class="card-text">运行中任务</p>
|
||||
</div>
|
          <div class="align-self-center">
            <i class="bi bi-arrow-clockwise fs-1"></i>
          </div>
        </div>
      </div>
    </div>
  </div>
</div>

<div class="row">
  <!-- Website filter and pagination controls -->
  <div class="col-12 mb-3">
    <div class="card">
      <div class="card-body">
        <div class="row">
          <div class="col-md-6">
            <form method="get" class="d-flex">
              <select name="website" class="form-select me-2" onchange="this.form.submit()">
                <option value="">所有网站</option>
                {% for website in stats.websites %}
                <option value="{{ website.id }}" {% if website.id == stats.selected_website_id %}selected{% endif %}>
                  {{ website.name }} ({{ website.region }})
                </option>
                {% endfor %}
              </select>

              <select name="page_size" class="form-select me-2" onchange="this.form.submit()">
                <option value="10" {% if stats.page_size == 10 %}selected{% endif %}>10条/页</option>
                <option value="20" {% if stats.page_size == 20 %}selected{% endif %}>20条/页</option>
                <option value="50" {% if stats.page_size == 50 %}selected{% endif %}>50条/页</option>
                <option value="100" {% if stats.page_size == 100 %}selected{% endif %}>100条/页</option>
              </select>

              <noscript>
                <button type="submit" class="btn btn-primary">应用</button>
              </noscript>
            </form>
          </div>
          <div class="col-md-6">
            <!-- Pagination navigation -->
            {% if stats.page_obj.has_other_pages %}
            <nav aria-label="页面导航">
              <ul class="pagination justify-content-end mb-0">
                {% if stats.page_obj.has_previous %}
                <li class="page-item">
                  <a class="page-link" href="?page={{ stats.page_obj.previous_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="上一页">
                    <span aria-hidden="true">«</span>
                  </a>
                </li>
                {% endif %}

                {% for num in stats.page_obj.paginator.page_range %}
                {% if stats.page_obj.number == num %}
                <li class="page-item active">
                  <span class="page-link">{{ num }}</span>
                </li>
                {% elif num > stats.page_obj.number|add:'-3' and num < stats.page_obj.number|add:'3' %}
                <li class="page-item">
                  <a class="page-link" href="?page={{ num }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}">{{ num }}</a>
                </li>
                {% endif %}
                {% endfor %}

                {% if stats.page_obj.has_next %}
                <li class="page-item">
                  <a class="page-link" href="?page={{ stats.page_obj.next_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="下一页">
                    <span aria-hidden="true">»</span>
                  </a>
                </li>
                {% endif %}
              </ul>
            </nav>
            {% endif %}
          </div>
        </div>
      </div>
    </div>
  </div>

  <!-- Content grouped by website -->
  <div class="col-md-8">
    {% for website_name, contents in stats.contents_by_website.items %}
    <div class="card mb-4">
      <div class="card-header">
        <h5 class="card-title mb-0">
          <i class="bi bi-globe"></i> {{ website_name }}
          <span class="badge bg-secondary">{{ contents|length }}</span>
        </h5>
      </div>
      <div class="card-body">
        <div class="list-group list-group-flush">
          {% for content in contents %}
          <div class="list-group-item">
            <div class="d-flex w-100 justify-content-between">
              <h6 class="mb-1">
                {% if content.is_local_saved %}
                <a href="{% url 'preview_crawled_content' content.id %}" target="_blank" class="text-decoration-none">
                  {{ content.title|truncatechars:60 }}
                </a>
                {% else %}
                <a href="{{ content.url }}" target="_blank" class="text-decoration-none">
                  {{ content.title|truncatechars:60 }}
                </a>
                {% endif %}
              </h6>
              <small class="text-muted">{{ content.created_at|date:"m-d H:i" }}</small>
            </div>
            <p class="mb-1 content-preview">{{ content.content|truncatechars:100 }}</p>
            <div class="d-flex justify-content-between align-items-center">
              <small class="text-muted">
                <i class="bi bi-geo-alt"></i> {{ content.website.region }}
                {% if content.media_files.count > 0 %}
                | <i class="bi bi-image"></i> {{ content.media_files.count }} 个媒体文件
                {% endif %}
              </small>
              <div>
                {% for keyword in content.keywords_matched|split:"," %}
                <span class="keyword-badge">{{ keyword|strip }}</span>
                {% endfor %}
              </div>
            </div>
          </div>
          {% endfor %}
        </div>
      </div>
    </div>
    {% empty %}
    <div class="card">
      <div class="card-body text-center">
        <p class="text-muted py-3">暂无爬取内容</p>
      </div>
    </div>
    {% endfor %}

    <!-- Pagination info -->
    {% if stats.page_obj.has_other_pages %}
    <div class="card">
      <div class="card-body">
        <div class="d-flex justify-content-between align-items-center">
          <div>
            显示第 {{ stats.page_obj.start_index }} 到 {{ stats.page_obj.end_index }} 条,共 {{ stats.page_obj.paginator.count }} 条记录
          </div>
          <div>
            <!-- Pagination navigation (repeated here for convenience) -->
            <nav aria-label="页面导航">
              <ul class="pagination mb-0">
                {% if stats.page_obj.has_previous %}
                <li class="page-item">
                  <a class="page-link" href="?page={{ stats.page_obj.previous_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="上一页">
                    <span aria-hidden="true">«</span>
                  </a>
                </li>
                {% endif %}

                {% for num in stats.page_obj.paginator.page_range %}
                {% if stats.page_obj.number == num %}
                <li class="page-item active">
                  <span class="page-link">{{ num }}</span>
                </li>
                {% elif num > stats.page_obj.number|add:'-3' and num < stats.page_obj.number|add:'3' %}
                <li class="page-item">
                  <a class="page-link" href="?page={{ num }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}">{{ num }}</a>
                </li>
                {% endif %}
                {% endfor %}

                {% if stats.page_obj.has_next %}
                <li class="page-item">
                  <a class="page-link" href="?page={{ stats.page_obj.next_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="下一页">
                    <span aria-hidden="true">»</span>
                  </a>
                </li>
                {% endif %}
              </ul>
            </nav>
          </div>
        </div>
      </div>
    </div>
    {% endif %}
  </div>

  <!-- Recent tasks -->
  <div class="col-md-4">
    <div class="card">
      <div class="card-header">
        <h5 class="card-title mb-0">
          <i class="bi bi-list-check"></i> 最近的任务
        </h5>
      </div>
      <div class="card-body">
        {% if stats.recent_tasks %}
        <div class="list-group list-group-flush">
          {% for task in stats.recent_tasks %}
          <div class="list-group-item">
            <div class="d-flex w-100 justify-content-between">
              <h6 class="mb-1">{{ task.name|truncatechars:30 }}</h6>
              <span class="badge bg-{% if task.status == 'completed' %}success{% elif task.status == 'failed' %}danger{% elif task.status == 'running' %}warning{% else %}secondary{% endif %}">
                {{ task.get_status_display }}
              </span>
            </div>
            <p class="mb-1">
              <small class="text-muted">关键字: {{ task.keywords|truncatechars:40 }}</small>
            </p>
            <small class="text-muted">{{ task.created_at|date:"m-d H:i" }}</small>
          </div>
          {% endfor %}
        </div>
        {% else %}
        <p class="text-muted text-center py-3">暂无任务</p>
        {% endif %}
      </div>
    </div>
  </div>
</div>

<!-- Quick actions -->
<div class="row mt-4">
  <div class="col-12">
    <div class="card">
      <div class="card-header">
        <h5 class="card-title mb-0">
          <i class="bi bi-lightning"></i> 快速操作
        </h5>
      </div>
      <div class="card-body">
        <div class="row">
          <div class="col-md-4 mb-3">
            <a href="{% url 'search' %}" class="btn btn-primary w-100">
              <i class="bi bi-search"></i> 搜索内容
            </a>
          </div>
          <div class="col-md-4 mb-3">
            <a href="/admin/crawler/crawltask/add/" class="btn btn-success w-100">
              <i class="bi bi-plus-circle"></i> 创建任务
            </a>
          </div>
          <div class="col-md-4 mb-3">
            <a href="/admin/" class="btn btn-outline-secondary w-100">
              <i class="bi bi-gear"></i> 管理后台
            </a>
          </div>
        </div>
      </div>
    </div>
  </div>
</div>
{% endblock %}
128
crawler/templates/crawler/search.html
Normal file
@@ -0,0 +1,128 @@
{% extends 'crawler/base.html' %}
{% load custom_filters %}

{% block title %}搜索内容 - 网站爬虫系统{% endblock %}

{% block content %}
<div class="row">
  <div class="col-12">
    <h1 class="mb-4">
      <i class="bi bi-search"></i> 内容搜索
    </h1>
  </div>
</div>

<!-- Search form -->
<div class="row mb-4">
  <div class="col-12">
    <div class="card">
      <div class="card-body">
        <form method="get" action="{% url 'search' %}">
          <div class="input-group input-group-lg">
            <input type="text"
                   class="form-control"
                   name="q"
                   value="{{ keyword }}"
                   placeholder="输入关键字搜索内容..."
                   required>
            <button class="btn btn-primary" type="submit">
              <i class="bi bi-search"></i> 搜索
            </button>
          </div>
        </form>
      </div>
    </div>
  </div>
</div>

<!-- Search results -->
{% if keyword %}
<div class="row">
  <div class="col-12">
    <div class="card">
      <div class="card-header">
        <h5 class="card-title mb-0">
          <i class="bi bi-list-ul"></i> 搜索结果
          {% if contents %}
          <span class="badge bg-primary ms-2">{{ contents|length }} 条结果</span>
          {% endif %}
        </h5>
      </div>
      <div class="card-body">
        {% if contents %}
        <div class="list-group list-group-flush">
          {% for content in contents %}
          <div class="list-group-item">
            <div class="d-flex w-100 justify-content-between">
              <h5 class="mb-1">
                {% if content.is_local_saved %}
                <a href="{% url 'preview_crawled_content' content.id %}" target="_blank" class="text-decoration-none">
                  {{ content.title }}
                </a>
                {% else %}
                <a href="{{ content.url }}" target="_blank" class="text-decoration-none">
                  {{ content.title }}
                </a>
                {% endif %}
              </h5>
              <small class="text-muted">{{ content.created_at|date:"Y-m-d H:i" }}</small>
            </div>
            <p class="mb-2 content-preview">{{ content.content|truncatechars:200 }}</p>
            <div class="d-flex justify-content-between align-items-center">
              <small class="text-muted">
                <i class="bi bi-geo-alt"></i> {{ content.website.region }} - {{ content.website.name }}
                {% if content.author %}
                | <i class="bi bi-person"></i> {{ content.author }}
                {% endif %}
                {% if content.publish_date %}
                | <i class="bi bi-calendar"></i> {{ content.publish_date|date:"Y-m-d" }}
                {% endif %}
                {% if content.media_files.count > 0 %}
                | <i class="bi bi-image"></i> {{ content.media_files.count }} 个媒体文件
                {% endif %}
              </small>
              <div>
                {% for keyword in content.keywords_matched|split:"," %}
                <span class="keyword-badge">{{ keyword|strip }}</span>
                {% endfor %}
              </div>
            </div>
          </div>
          {% endfor %}
        </div>
        {% else %}
        <div class="text-center py-5">
          <i class="bi bi-search fs-1 text-muted"></i>
          <p class="text-muted mt-3">没有找到包含 "{{ keyword }}" 的内容</p>
          <p class="text-muted">请尝试其他关键字或检查爬取任务是否正常运行</p>
        </div>
        {% endif %}
      </div>
    </div>
  </div>
</div>
{% else %}
<!-- Search tips -->
<div class="row">
  <div class="col-12">
    <div class="card">
      <div class="card-body text-center py-5">
        <i class="bi bi-search fs-1 text-muted"></i>
        <h4 class="text-muted mt-3">开始搜索</h4>
        <p class="text-muted">在上方输入框中输入关键字,搜索已爬取的内容</p>
        <div class="mt-4">
          <h6>搜索建议:</h6>
          <div class="d-flex flex-wrap justify-content-center gap-2">
            <span class="badge bg-light text-dark">反腐败</span>
            <span class="badge bg-light text-dark">纪律检查</span>
            <span class="badge bg-light text-dark">监督</span>
            <span class="badge bg-light text-dark">廉政</span>
            <span class="badge bg-light text-dark">违纪</span>
          </div>
        </div>
      </div>
    </div>
  </div>
</div>
{% endif %}
{% endblock %}
0
crawler/templatetags/__init__.py
Normal file
32
crawler/templatetags/custom_filters.py
Normal file
@@ -0,0 +1,32 @@
from django import template

register = template.Library()


@register.filter
def split(value, separator=','):
    """Split a string by separator"""
    if not value:
        return []
    return value.split(separator)


@register.filter
def strip(value):
    """Strip whitespace from a string"""
    if not value:
        return ''
    return value.strip()


@register.filter
def div(value, divisor):
    """Divide value by divisor"""
    try:
        value = float(value)
        divisor = float(divisor)
        if divisor == 0:
            return 0
        return value / divisor
    except (ValueError, TypeError):
        return 0
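Because @register.filter leaves the functions callable as plain Python, they can be sanity-checked outside a template. A minimal sketch (not part of the commit), assuming the crawler package is importable:

from crawler.templatetags.custom_filters import split, strip, div

keywords = "反腐败, 监督,廉政"
cleaned = [strip(k) for k in split(keywords, ",")]  # strip removes stray spaces left by split
print(cleaned)       # ['反腐败', '监督', '廉政']
print(div(75, 200))  # 0.375
print(div(1, 0))     # 0 -- division by zero falls back to 0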
3
crawler/tests.py
Normal file
@@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.
8
crawler/urls.py
Normal file
@@ -0,0 +1,8 @@
from django.urls import path, include
from . import views

urlpatterns = [
    path('', views.dashboard, name='dashboard'),
    path('search/', views.search_page, name='search'),
    path('crawled-content/<int:content_id>/preview/', views.preview_crawled_content, name='preview_crawled_content'),
]
292
crawler/views.py
Normal file
@@ -0,0 +1,292 @@
from django.shortcuts import render, get_object_or_404
from django.http import HttpResponse
from django.db.models import Q, Count
from django.conf import settings
from django.utils import timezone
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword
from rest_framework import viewsets, filters
from rest_framework.decorators import action
from rest_framework.response import Response
from .serializers import (
    WebsiteSerializer, CrawlTaskSerializer, CrawledContentSerializer,
    CrawlLogSerializer, SearchKeywordSerializer
)
import json
from django.core.paginator import Paginator
from django.db.models.functions import TruncDate
from django.db.models import Count


def dashboard(request):
    """Dashboard view"""
    # Summary statistics
    total_websites = Website.objects.filter(is_active=True).count()
    total_tasks = CrawlTask.objects.count()
    total_contents = CrawledContent.objects.count()
    active_tasks = CrawlTask.objects.filter(status='running').count()

    # All active websites
    websites = Website.objects.filter(is_active=True).order_by('name')

    # Currently selected website ID
    selected_website_id = request.GET.get('website')

    # Pagination parameters
    page_number = request.GET.get('page', 1)
    page_size = request.GET.get('page_size', 20)  # default: 20 articles per page

    # Coerce page_size to an integer
    try:
        page_size = int(page_size)
        # Clamp page_size to a sensible range
        page_size = max(10, min(100, page_size))
    except (ValueError, TypeError):
        page_size = 20

    # All crawled content, ordered by website and crawl time
    all_contents = CrawledContent.objects.select_related('website').order_by('website__name', '-created_at')

    # Filter by the selected website, if any
    if selected_website_id:
        try:
            selected_website_id = int(selected_website_id)
            all_contents = all_contents.filter(website_id=selected_website_id)
        except (ValueError, TypeError):
            pass

    # Pagination
    paginator = Paginator(all_contents, page_size)
    page_obj = paginator.get_page(page_number)

    # Group the current page's content by website
    contents_by_website = {}
    for content in page_obj:
        website_name = content.website.name
        if website_name not in contents_by_website:
            contents_by_website[website_name] = []
        contents_by_website[website_name].append(content)

    # Most recent tasks
    recent_tasks = CrawlTask.objects.order_by('-created_at')[:5]

    # Media file statistics
    total_media_files = CrawledContent.objects.aggregate(
        total_media=Count('media_files')
    )['total_media'] or 0

    stats = {
        'total_websites': total_websites,
        'total_tasks': total_tasks,
        'total_contents': total_contents,
        'active_tasks': active_tasks,
        'websites': websites,
        'selected_website_id': selected_website_id,
        'page_obj': page_obj,
        'contents_by_website': contents_by_website,
        'page_size': page_size,
        'recent_tasks': recent_tasks,
        'total_media_files': total_media_files,
    }

    return render(request, 'crawler/dashboard.html', {'stats': stats})


def search_page(request):
    """Search page view"""
    keyword = request.GET.get('q', '').strip()
    contents = []

    if keyword:
        # Record the search keyword
        SearchKeyword.objects.get_or_create(
            keyword=keyword,
            defaults={'last_used': timezone.now()}
        )

        # Search the crawled content
        contents = CrawledContent.objects.filter(
            Q(title__icontains=keyword) |
            Q(content__icontains=keyword) |
            Q(keywords_matched__icontains=keyword)
        ).order_by('-created_at')[:50]

    return render(request, 'crawler/search.html', {
        'keyword': keyword,
        'contents': contents
    })


def preview_crawled_content(request, content_id):
    """Preview a crawled article"""
    content = get_object_or_404(CrawledContent, id=content_id)

    # Attached media files
    media_files = content.media_files.all()

    # Build the media file HTML
    media_section = ""
    if media_files:
        media_section = """
        <div class="media-section">
            <h3>媒体文件</h3>
        """
        for media_file in media_files:
            if media_file.media_type == 'image':
                media_section += f"""
                <div class="media-item">
                    <h4>图片: {media_file.alt_text or '无标题'}</h4>
                    <img src="/media/{media_file.local_file.name}" alt="{media_file.alt_text}" style="max-width: 100%; height: auto;">
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
            elif media_file.media_type == 'video':
                media_section += f"""
                <div class="media-item">
                    <h4>视频</h4>
                    <video controls style="max-width: 100%;">
                        <source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
                        您的浏览器不支持视频播放。
                    </video>
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
            elif media_file.media_type == 'audio':
                media_section += f"""
                <div class="media-item">
                    <h4>音频</h4>
                    <audio controls>
                        <source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
                        您的浏览器不支持音频播放。
                    </audio>
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
            else:
                media_section += f"""
                <div class="media-item">
                    <h4>文件: {media_file.get_media_type_display()}</h4>
                    <p><a href="/media/{media_file.local_file.name}" download>下载文件</a></p>
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
        media_section += " </div>"

    # Convert newlines in the stored text into paragraphs and <br> tags
    formatted_content = content.content.replace('\n\n', '</p><p>').replace('\n', '<br>')

    # Render the preview page dynamically
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>{content.title}</title>
        <style>
            body {{
                font-family: Arial, sans-serif;
                margin: 40px;
                line-height: 1.6;
                max-width: 1200px;
                margin: 40px auto;
            }}
            h1 {{ color: #333; margin-bottom: 20px; }}
            .meta {{
                color: #666;
                margin-bottom: 30px;
                padding: 20px;
                background-color: #f8f9fa;
                border-radius: 8px;
                border-left: 4px solid #007bff;
            }}
            .content {{
                line-height: 1.8;
                font-size: 16px;
                margin-bottom: 30px;
            }}
            .content p {{
                margin-bottom: 1em;
            }}
            .media-section {{
                margin-top: 30px;
                padding: 20px;
                background-color: #f8f9fa;
                border-radius: 8px;
            }}
            .media-item {{
                margin-bottom: 20px;
                padding: 15px;
                border: 1px solid #ddd;
                border-radius: 5px;
                background-color: white;
            }}
            .media-item h4 {{
                margin-top: 0;
                color: #555;
                border-bottom: 1px solid #eee;
                padding-bottom: 10px;
            }}
            .back-link {{
                margin-bottom: 20px;
            }}
            .back-link a {{
                color: #007bff;
                text-decoration: none;
                font-weight: bold;
                padding: 8px 16px;
                background-color: #f8f9fa;
                border: 1px solid #ddd;
                border-radius: 4px;
            }}
            .back-link a:hover {{
                text-decoration: underline;
                background-color: #e9ecef;
            }}
            .navbar {{
                background-color: #007bff;
                padding: 15px;
                margin-bottom: 30px;
                border-radius: 8px;
            }}
            .navbar a {{
                color: white;
                text-decoration: none;
                margin-right: 20px;
                font-weight: bold;
            }}
            .navbar a:hover {{
                text-decoration: underline;
            }}
        </style>
    </head>
    <body>
        <div class="navbar">
            <a href="/">仪表板</a>
            <a href="/admin/crawler/crawledcontent/">管理界面</a>
            <a href="javascript:history.back()">← 返回</a>
        </div>

        <h1>{content.title}</h1>

        <div class="meta">
            <p><strong>来源网站:</strong> {content.website.name} ({content.website.region})</p>
            <p><strong>原始链接:</strong> <a href="{content.url}" target="_blank">{content.url}</a></p>
            <p><strong>发布时间:</strong> {content.publish_date or '未知'}</p>
            <p><strong>作者:</strong> {content.author or '未知'}</p>
            <p><strong>匹配关键字:</strong> {content.keywords_matched}</p>
            <p><strong>爬取时间:</strong> {content.created_at}</p>
            <p><strong>媒体文件数量:</strong> {len(media_files)}</p>
        </div>

        <div class="content">
            <p>{formatted_content}</p>
        </div>

        {media_section}
    </body>
    </html>
    """
    return HttpResponse(html_content, content_type='text/html; charset=utf-8')
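A minimal smoke test for the search view above could look like the following sketch (not part of the commit); it relies only on the URL wired up in crawler/urls.py and the context variables set by search_page, and assumes an empty test database.

from django.test import TestCase

class SearchPageSmokeTest(TestCase):
    def test_search_page_renders(self):
        # With no crawled content, the view should still render and hand the
        # keyword back in the context alongside an empty result set.
        response = self.client.get('/search/', {'q': '监督'})
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.context['keyword'], '监督')
        self.assertEqual(list(response.context['contents']), [])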
4
crawler_project/__init__.py
Normal file
@@ -0,0 +1,4 @@
# Ensures the Celery app is loaded when Django starts
from .celery import app as celery_app

__all__ = ('celery_app',)
16
crawler_project/asgi.py
Normal file
@@ -0,0 +1,16 @@
"""
ASGI config for crawler_project project.

It exposes the ASGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/5.2/howto/deployment/asgi/
"""

import os

from django.core.asgi import get_asgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')

application = get_asgi_application()
17
crawler_project/celery.py
Normal file
@@ -0,0 +1,17 @@
import os
from celery import Celery

# Point Celery at the Django settings module
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')

app = Celery('crawler_project')

# Configure Celery from the Django settings file
app.config_from_object('django.conf:settings', namespace='CELERY')

# Auto-discover task modules
app.autodiscover_tasks()

@app.task(bind=True)
def debug_task(self):
    print(f'Request: {self.request!r}')
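Once a worker is running (see start_celery.sh below), the debug_task defined above can be queued as in this sketch (not part of the commit; it assumes the Redis broker from settings.py is reachable):

from crawler_project.celery import debug_task

result = debug_task.delay()      # enqueue the task for a worker to pick up
print(result.id, result.status)  # PENDING until a worker processes it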
181
crawler_project/settings.py
Normal file
@@ -0,0 +1,181 @@
"""
Django settings for crawler_project project.

Generated by 'django-admin startproject' using Django 5.2.6.

For more information on this file, see
https://docs.djangoproject.com/en/5.2/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.2/ref/settings/
"""

from pathlib import Path

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.2/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-w5lm159dl-)=z!dysfxf8!n^o26^6)4^!@5(yp*5-_c=!_tcq!'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []


# Application definition

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'crawler',
    'rest_framework',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'crawler_project.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'crawler_project.wsgi.application'


# Database
# https://docs.djangoproject.com/en/5.2/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': BASE_DIR / 'db.sqlite3',
    }
}


# Password validation
# https://docs.djangoproject.com/en/5.2/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]


# Internationalization
# https://docs.djangoproject.com/en/5.2/topics/i18n/

LANGUAGE_CODE = 'zh-hans'

TIME_ZONE = 'Asia/Shanghai'

USE_I18N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.2/howto/static-files/

STATIC_URL = 'static/'

# Media files (user-uploaded and downloaded files)
MEDIA_URL = '/media/'
MEDIA_ROOT = BASE_DIR / 'media'

# Default primary key field type
# https://docs.djangoproject.com/en/5.2/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

# Celery configuration
CELERY_BROKER_URL = 'redis://localhost:6379/0'
CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = TIME_ZONE

# Crawler settings
CRAWLER_SETTINGS = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'REQUEST_DELAY': 1,  # delay between requests (seconds)
    'TIMEOUT': 30,  # request timeout (seconds)
    'MAX_RETRIES': 3,  # maximum retry attempts
}

# Target website list
TARGET_WEBSITES = [
    {'name': '中共中央纪委', 'url': 'https://www.ccdi.gov.cn/', 'region': '中央'},
    {'name': '北京纪检监察', 'url': 'https://www.bjsupervision.gov.cn/', 'region': '北京'},
    {'name': '天津纪检监察', 'url': 'https://www.tjjw.gov.cn/', 'region': '天津'},
    {'name': '河北纪检监察', 'url': 'http://www.hebcdi.gov.cn/', 'region': '河北'},
    {'name': '山西纪检监察', 'url': 'http://www.sxdi.gov.cn/', 'region': '山西'},
    {'name': '内蒙古纪检监察', 'url': 'https://www.nmgjjjc.gov.cn/', 'region': '内蒙古'},
    {'name': '辽宁纪检监察', 'url': 'https://www.lnsjjjc.gov.cn/', 'region': '辽宁'},
    {'name': '吉林纪检监察', 'url': 'http://ccdijl.gov.cn/', 'region': '吉林'},
    {'name': '黑龙江纪检监察', 'url': 'https://www.hljjjjc.gov.cn/Hljjjjc/', 'region': '黑龙江'},
    {'name': '上海纪检监察', 'url': 'https://www.shjjjc.gov.cn/', 'region': '上海'},
    {'name': '江苏纪检监察', 'url': 'https://www.jssjw.gov.cn/', 'region': '江苏'},
    {'name': '浙江纪检监察', 'url': 'https://www.zjsjw.gov.cn/shouye/', 'region': '浙江'},
    {'name': '安徽纪检监察', 'url': 'http://www.ahjjjc.gov.cn/', 'region': '安徽'},
    {'name': '福建纪检监察', 'url': 'https://www.fjcdi.gov.cn/cms/html/fjsjwjw/index.html', 'region': '福建'},
    {'name': '江西纪检监察', 'url': 'http://www.jxdi.gov.cn/', 'region': '江西'},
    {'name': '山东纪检监察', 'url': 'https://www.sdjj.gov.cn/', 'region': '山东'},
    {'name': '河南纪检监察', 'url': 'https://www.hnsjw.gov.cn/sitesources/hnsjct/page_pc/index.html', 'region': '河南'},
    {'name': '湖北纪检监察', 'url': 'https://www.hbjwjc.gov.cn/', 'region': '湖北'},
    {'name': '湖南纪检监察', 'url': 'https://www.sxfj.gov.cn/', 'region': '湖南'},
    {'name': '广东纪检监察', 'url': 'https://www.gdjct.gd.gov.cn/', 'region': '广东'},
    {'name': '广西纪检监察', 'url': 'https://www.gxjjw.gov.cn/index.shtml', 'region': '广西'},
    {'name': '海南纪检监察', 'url': 'https://www.hncdi.gov.cn/web/hnlzw/v2/html/index.jsp', 'region': '海南'},
    {'name': '重庆纪检监察', 'url': 'https://jjc.cq.gov.cn/', 'region': '重庆'},
    {'name': '四川纪检监察', 'url': 'https://www.scjc.gov.cn/', 'region': '四川'},
    {'name': '贵州纪检监察', 'url': 'http://www.gzdis.gov.cn/', 'region': '贵州'},
    {'name': '云南纪检监察', 'url': 'http://www.ynjjjc.gov.cn/', 'region': '云南'},
    {'name': '西藏纪检监察', 'url': 'http://www.xzjjw.gov.cn/', 'region': '西藏'},
    {'name': '陕西纪检监察', 'url': 'https://www.qinfeng.gov.cn/', 'region': '陕西'},
    {'name': '甘肃纪检监察', 'url': 'http://www.gsjw.gov.cn/', 'region': '甘肃'},
    {'name': '青海纪检监察', 'url': 'http://www.nxjjjc.gov.cn/', 'region': '青海'},
    {'name': '宁夏纪检监察', 'url': 'http://www.qhjc.gov.cn/', 'region': '宁夏'},
    {'name': '新疆纪检监察', 'url': 'https://www.xjjw.gov.cn/', 'region': '新疆'},
    {'name': '新疆兵团纪检监察', 'url': 'http://btjw.xjbt.gov.cn/', 'region': '新疆兵团'},
]
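CRAWLER_SETTINGS is plain data; a fetch helper in the crawler could consume it roughly as in this sketch (illustrative only; the helper name and retry loop are assumptions, not the project's actual crawler code):

import time
import requests
from django.conf import settings

def fetch(url):
    # Apply the configured User-Agent, timeout, retry count and request delay.
    cfg = settings.CRAWLER_SETTINGS
    headers = {'User-Agent': cfg['USER_AGENT']}
    for _ in range(cfg['MAX_RETRIES']):
        try:
            resp = requests.get(url, headers=headers, timeout=cfg['TIMEOUT'])
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            time.sleep(cfg['REQUEST_DELAY'])
    return None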
29
crawler_project/urls.py
Normal file
@@ -0,0 +1,29 @@
"""
URL configuration for crawler_project project.

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/5.2/topics/http/urls/
Examples:
Function views
    1. Add an import:  from my_app import views
    2. Add a URL to urlpatterns:  path('', views.home, name='home')
Class-based views
    1. Add an import:  from other_app.views import Home
    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
Including another URLconf
    1. Import the include() function: from django.urls import include, path
    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include
from django.conf import settings
from django.conf.urls.static import static

urlpatterns = [
    path('admin/', admin.site.urls),
    path('', include('crawler.urls')),
]

if settings.DEBUG:
    urlpatterns += static(settings.STATIC_URL, document_root=settings.STATIC_ROOT)
    urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
16
crawler_project/wsgi.py
Normal file
@@ -0,0 +1,16 @@
"""
WSGI config for crawler_project project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/5.2/howto/deployment/wsgi/
"""

import os

from django.core.wsgi import get_wsgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')

application = get_wsgi_application()
22
manage.py
Executable file
@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys


def main():
    """Run administrative tasks."""
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()
29
requirements.txt
Normal file
@@ -0,0 +1,29 @@
amqp==5.3.1
asgiref==3.9.1
beautifulsoup4==4.13.5
billiard==4.2.2
celery==5.5.3
certifi==2025.8.3
charset-normalizer==3.4.3
click==8.3.0
click-didyoumean==0.3.1
click-plugins==1.1.1.2
click-repl==0.3.0
Django==5.2.6
djangorestframework==3.15.2
idna==3.10
kombu==5.5.4
lxml==6.0.2
packaging==25.0
prompt_toolkit==3.0.52
python-dateutil==2.9.0.post0
redis==6.4.0
requests==2.32.5
six==1.17.0
soupsieve==2.8
sqlparse==0.5.3
typing_extensions==4.15.0
tzdata==2025.2
urllib3==2.5.0
vine==5.1.0
wcwidth==0.2.14
49
start.sh
Executable file
@@ -0,0 +1,49 @@
#!/bin/bash

echo "启动网站爬虫系统..."

# Make sure the script is run from the project root
if [ ! -f "manage.py" ]; then
    echo "错误: 请在项目根目录运行此脚本"
    exit 1
fi

# Check the Python environment
if ! command -v python3 &> /dev/null; then
    echo "错误: 未找到Python3"
    exit 1
fi

# Install dependencies
echo "安装依赖..."
pip install -r requirements.txt

# Run database migrations
echo "执行数据库迁移..."
python3 manage.py makemigrations
python3 manage.py migrate

# Initialise website data
echo "初始化网站数据..."
python3 manage.py init_websites

# Create a superuser if one does not exist
echo "检查超级用户..."
python3 manage.py shell -c "
from django.contrib.auth import get_user_model
User = get_user_model()
if not User.objects.filter(username='admin').exists():
    User.objects.create_superuser('admin', 'admin@example.com', 'admin123')
    print('创建超级用户: admin/admin123')
else:
    print('超级用户已存在')
"

echo "启动Django服务器..."
echo "访问地址: http://localhost:8000"
echo "管理后台: http://localhost:8000/admin"
echo "用户名: admin, 密码: admin123"
echo ""
echo "按 Ctrl+C 停止服务器"

python3 manage.py runserver 0.0.0.0:8000
12
start_celery.sh
Executable file
@@ -0,0 +1,12 @@
#!/bin/bash

# Start the Celery worker
echo "启动Celery Worker..."
celery -A crawler_project worker --loglevel=info --concurrency=4 &

# Start Celery Beat (periodic tasks)
echo "启动Celery Beat..."
celery -A crawler_project beat --loglevel=info &

echo "Celery服务已启动"
echo "Worker PID: $!"