Base setup

.gitignore (vendored): 2 lines changed
@@ -166,7 +166,7 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
 
 # Ruff stuff:
 .ruff_cache/
crawler/__init__.py: new file, empty

crawler/admin.py: new file, 199 lines
@@ -0,0 +1,199 @@
from django.contrib import admin
from django.utils.html import format_html
from django.urls import reverse
from django.utils.safestring import mark_safe
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile
from .tasks import crawl_websites_task


@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
    list_display = ['name', 'region', 'url', 'is_active', 'created_at']
    list_filter = ['region', 'is_active', 'created_at']
    search_fields = ['name', 'url', 'region']
    list_editable = ['is_active']
    ordering = ['region', 'name']


@admin.register(CrawlTask)
class CrawlTaskAdmin(admin.ModelAdmin):
    list_display = ['name', 'status', 'created_by', 'progress_display', 'created_at', 'completed_at']
    list_filter = ['status', 'created_by', 'created_at']
    search_fields = ['name', 'keywords']
    readonly_fields = ['created_at', 'started_at', 'completed_at', 'progress_display']
    filter_horizontal = ['websites']
    actions = ['execute_crawl_task']

    def progress_display(self, obj):
        """Show the task progress."""
        if obj.status == 'completed':
            color = 'green'
        elif obj.status == 'failed':
            color = 'red'
        elif obj.status == 'running':
            color = 'orange'
        else:
            color = 'gray'

        return format_html(
            '<span style="color: {};">{}%</span>',
            color,
            f'{obj.progress_percentage:.1f} ({obj.crawled_pages}/{obj.total_pages})'
        )
    progress_display.short_description = '进度'

    def execute_crawl_task(self, request, queryset):
        """Run the selected crawl tasks."""
        for task in queryset:
            # Reset the task status to pending
            task.status = 'pending'
            task.save()

            # Run the crawl asynchronously
            crawl_websites_task.delay(task.id)

        self.message_user(request, f"已启动 {queryset.count()} 个爬取任务。")
    execute_crawl_task.short_description = "执行选中的爬取任务"


@admin.register(CrawledContent)
class CrawledContentAdmin(admin.ModelAdmin):
    list_display = ['title_short', 'website', 'task', 'keywords_matched', 'media_count', 'publish_date', 'is_local_saved', 'created_at']
    list_filter = ['website', 'task', 'created_at', 'publish_date', 'is_local_saved']
    search_fields = ['title', 'content', 'keywords_matched']
    readonly_fields = ['created_at', 'preview_content', 'media_files_display']
    ordering = ['-created_at']

    def title_short(self, obj):
        """Show a truncated title."""
        return obj.title[:50] + '...' if len(obj.title) > 50 else obj.title
    title_short.short_description = '标题'

    def media_count(self, obj):
        """Show the number of attached media files."""
        count = obj.media_files.count()
        if count > 0:
            return format_html(
                '<span style="color: green; font-weight: bold;">{}</span>',
                count
            )
        return "0"
    media_count.short_description = '媒体文件'

    def preview_content(self, obj):
        """Preview the content."""
        if obj.is_local_saved:
            url = reverse('admin:crawled_content_preview', args=[obj.id])
            return format_html(
                '<a href="{}" target="_blank" class="button">预览文章</a>',
                url
            )
        elif obj.content:
            return format_html(
                '<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px;">{}</div>',
                obj.get_preview_content(500)
            )
        else:
            return "无内容"
    preview_content.short_description = '内容预览'

    def media_files_display(self, obj):
        """Show the list of media files."""
        media_files = obj.media_files.all()
        if not media_files:
            return "无媒体文件"

        html = "<div style='max-height: 300px; overflow-y: auto;'>"
        for media_file in media_files:
            if media_file.media_type == 'image':
                html += format_html(
                    '<div style="margin-bottom: 10px; border: 1px solid #ddd; padding: 5px;">'
                    '<strong>图片:</strong> {}<br>'
                    '<img src="/media/{}" style="max-width: 150px; max-height: 150px;" /><br>'
                    '<small>大小: {}</small>'
                    '</div>',
                    media_file.alt_text or '无标题',
                    media_file.local_file.name,
                    media_file.file_size_display
                )
            elif media_file.media_type == 'video':
                html += format_html(
                    '<div style="margin-bottom: 10px; border: 1px solid #ddd; padding: 5px;">'
                    '<strong>视频:</strong><br>'
                    '<video controls style="max-width: 200px; max-height: 150px;">'
                    '<source src="/media/{}" type="{}">'
                    '</video><br>'
                    '<small>大小: {}</small>'
                    '</div>',
                    media_file.local_file.name,
                    media_file.mime_type,
                    media_file.file_size_display
                )
            else:
                html += format_html(
                    '<div style="margin-bottom: 10px; border: 1px solid #ddd; padding: 5px;">'
                    '<strong>{}:</strong> <a href="/media/{}" download>下载</a><br>'
                    '<small>大小: {}</small>'
                    '</div>',
                    media_file.get_media_type_display(),
                    media_file.local_file.name,
                    media_file.file_size_display
                )
        html += "</div>"
        # The fragments above were already escaped by format_html, so only mark the
        # combined string as safe (format_html() without args raises on Django 5+).
        return mark_safe(html)
    media_files_display.short_description = '媒体文件'


@admin.register(CrawlLog)
class CrawlLogAdmin(admin.ModelAdmin):
    list_display = ['level', 'message_short', 'website', 'task', 'created_at']
    list_filter = ['level', 'website', 'task', 'created_at']
    search_fields = ['message']
    readonly_fields = ['created_at']
    ordering = ['-created_at']

    def message_short(self, obj):
        """Show a truncated message."""
        return obj.message[:100] + '...' if len(obj.message) > 100 else obj.message
    message_short.short_description = '消息'


@admin.register(MediaFile)
class MediaFileAdmin(admin.ModelAdmin):
    list_display = ['content', 'media_type', 'file_size_display', 'mime_type', 'created_at']
    list_filter = ['media_type', 'created_at']
    search_fields = ['content__title', 'original_url', 'alt_text']
    readonly_fields = ['created_at', 'file_size_display', 'media_preview']
    ordering = ['-created_at']

    def media_preview(self, obj):
        """Preview the media file."""
        if obj.media_type == 'image' and obj.local_file:
            return format_html(
                '<img src="/media/{}" style="max-width: 200px; max-height: 200px;" />',
                obj.local_file.name
            )
        elif obj.media_type == 'video' and obj.local_file:
            return format_html(
                '<video controls style="max-width: 200px; max-height: 200px;"><source src="/media/{}" type="{}"></video>',
                obj.local_file.name,
                obj.mime_type
            )
        elif obj.media_type == 'audio' and obj.local_file:
            return format_html(
                '<audio controls><source src="/media/{}" type="{}"></audio>',
                obj.local_file.name,
                obj.mime_type
            )
        else:
            return "无预览"
    media_preview.short_description = '预览'


@admin.register(SearchKeyword)
class SearchKeywordAdmin(admin.ModelAdmin):
    list_display = ['keyword', 'is_active', 'created_at', 'last_used']
    list_filter = ['is_active', 'created_at', 'last_used']
    search_fields = ['keyword']
    list_editable = ['is_active']
    ordering = ['-last_used', '-created_at']
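The preview button in CrawledContentAdmin reverses an 'admin:crawled_content_preview' URL that a stock ModelAdmin does not register, and the matching view is not part of this commit. A minimal sketch of the wiring it assumes, written as methods that would live inside the CrawledContentAdmin class above; the view method and template name are placeholders:

    # Hypothetical methods inside class CrawledContentAdmin(admin.ModelAdmin); not in this commit.
    from django.urls import path
    from django.shortcuts import get_object_or_404, render

    def get_urls(self):
        urls = super().get_urls()
        custom = [
            path(
                'preview/<int:content_id>/',
                self.admin_site.admin_view(self.preview_view),
                # registered in the admin namespace, so it matches
                # reverse('admin:crawled_content_preview')
                name='crawled_content_preview',
            ),
        ]
        return custom + urls

    def preview_view(self, request, content_id):
        content = get_object_or_404(CrawledContent, id=content_id)
        # 'crawler/preview.html' is an assumed template name
        return render(request, 'crawler/preview.html', {'content': content})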
crawler/apps.py: new file, 6 lines
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class CrawlerConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'crawler'

crawler/crawler_engine.py: new file, 578 lines
@@ -0,0 +1,578 @@
import requests
import time
import re
import logging
import os
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from django.conf import settings
from django.utils import timezone
from django.core.files.base import ContentFile
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Module logger
logger = logging.getLogger(__name__)


class WebsiteCrawler:
    """Website crawler engine."""

    def __init__(self, task_id):
        self.task = CrawlTask.objects.get(id=task_id)
        self.keywords = [kw.strip() for kw in self.task.keywords.split(',') if kw.strip()]

        # Create a session with a retry strategy
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': settings.CRAWLER_SETTINGS['USER_AGENT']
        })

        # Retry policy
        retry_strategy = Retry(
            total=settings.CRAWLER_SETTINGS.get('MAX_RETRIES', 3),
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        # Request timeout
        self.timeout = settings.CRAWLER_SETTINGS['TIMEOUT']

    def log(self, level, message, website=None):
        """Write a log entry."""
        CrawlLog.objects.create(
            task=self.task,
            website=website,
            level=level,
            message=message
        )
        # Also log through the Python logging system
        logger.log(getattr(logging, level.upper()), f"Task {self.task.id}: {message}")

    def update_task_status(self, status, **kwargs):
        """Update the task status."""
        self.task.status = status
        if status == 'running' and not self.task.started_at:
            self.task.started_at = timezone.now()
        elif status in ['completed', 'failed', 'cancelled']:
            self.task.completed_at = timezone.now()

        for key, value in kwargs.items():
            setattr(self.task, key, value)
        self.task.save()

    def extract_text_content(self, soup):
        """Extract text content while preserving paragraph structure."""
        # Remove script and style tags
        for script in soup(["script", "style"]):
            script.decompose()

        # Collect paragraph-level text to keep the structure
        paragraphs = []

        # Walk all paragraph-related tags
        for element in soup.find_all(['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br']):
            if element.name in ['p', 'div']:
                text = element.get_text().strip()
                if text:
                    paragraphs.append(text)
            elif element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                text = element.get_text().strip()
                if text:
                    paragraphs.append(f"\n{text}\n")  # surround headings with newlines
            elif element.name == 'br':
                paragraphs.append('\n')

        # Fall back to plain text extraction if no paragraph tags were found
        if not paragraphs:
            text = soup.get_text()
            # Clean the text but keep line breaks
            lines = []
            for line in text.splitlines():
                line = line.strip()
                if line:
                    lines.append(line)
            return '\n\n'.join(lines)

        # Join paragraphs with blank lines
        content = '\n\n'.join(paragraphs)

        # Collapse runs of blank lines
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)

        return content.strip()

    def find_article_links(self, soup, base_url):
        """Find article links on a page."""
        links = []

        # Common selectors for article links
        selectors = [
            'a[href*="article"]',
            'a[href*="news"]',
            'a[href*="content"]',
            'a[href*="detail"]',
            'a[href*="view"]',
            'a[href*="show"]',
            '.news-list a',
            '.article-list a',
            '.content-list a',
            'h3 a',
            'h4 a',
            '.title a',
            '.list-item a'
        ]

        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                href = element.get('href')
                if href:
                    full_url = urljoin(base_url, href)
                    title = element.get_text().strip()
                    if title and len(title) > 5:  # skip titles that are too short
                        links.append({
                            'url': full_url,
                            'title': title
                        })

        return links

    def check_keyword_match(self, text, title):
        """Return the keywords that match the text or the title."""
        matched_keywords = []
        text_lower = text.lower()
        title_lower = title.lower()

        for keyword in self.keywords:
            keyword_lower = keyword.lower()
            if keyword_lower in text_lower or keyword_lower in title_lower:
                matched_keywords.append(keyword)

        return matched_keywords

    def extract_article_content(self, url, soup):
        """Extract the article body."""
        # Try several content selectors
        content_selectors = [
            '.article-content',
            '.content',
            '.article-body',
            '.news-content',
            '.main-content',
            '.post-content',
            'article',
            '.detail-content',
            '#content',
            '.text'
        ]

        content = ""
        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                content = self.extract_text_content(element)
                if len(content) > 100:  # make sure the content is long enough
                    break

        # Fall back to the whole page if no dedicated content area was found
        if not content or len(content) < 100:
            content = self.extract_text_content(soup)

        return content

    def extract_publish_date(self, soup):
        """Extract the publish date."""
        date_selectors = [
            '.publish-time',
            '.pub-time',
            '.date',
            '.time',
            '.publish-date',
            'time[datetime]',
            '.article-time',
            '.news-time'
        ]

        for selector in date_selectors:
            element = soup.select_one(selector)
            if element:
                date_text = element.get_text().strip()
                if element.get('datetime'):
                    date_text = element.get('datetime')

                # Try to parse the date
                try:
                    # More sophisticated date parsing could go here;
                    # for now, return the current time.
                    return timezone.now()
                except Exception:
                    continue

        return None

    def extract_author(self, soup):
        """Extract the author."""
        author_selectors = [
            '.author',
            '.writer',
            '.publisher',
            '.byline',
            '.article-author',
            '.news-author'
        ]

        for selector in author_selectors:
            element = soup.select_one(selector)
            if element:
                return element.get_text().strip()

        return ""

    def download_media_file(self, media_url, crawled_content, media_type='image', alt_text=''):
        """Download a media file."""
        try:
            # Check that the URL is valid
            if not media_url or not media_url.startswith(('http://', 'https://')):
                return None

            # Request the media file
            response = self.session.get(
                media_url,
                timeout=self.timeout,
                verify=False,
                stream=False  # fetch the full body so response.content is complete
            )
            response.raise_for_status()

            # Collect file metadata
            content_type = response.headers.get('content-type', '')
            content_length = response.headers.get('content-length')
            file_size = int(content_length) if content_length else len(response.content)

            # Determine the file extension
            file_extension = self.get_file_extension_from_url(media_url, content_type)

            # Build the file name
            filename = f"media_{crawled_content.id}_{len(crawled_content.media_files.all())}{file_extension}"

            # Create the media file record
            media_file = MediaFile.objects.create(
                content=crawled_content,
                media_type=media_type,
                original_url=media_url,
                file_size=file_size,
                mime_type=content_type,
                alt_text=alt_text
            )

            # Save the file
            media_file.local_file.save(
                filename,
                ContentFile(response.content),
                save=True
            )

            self.log('info', f'媒体文件已下载: {filename} ({media_type})', crawled_content.website)
            return media_file

        except Exception as e:
            self.log('error', f'下载媒体文件失败 {media_url}: {str(e)}', crawled_content.website)
            return None

    def get_file_extension_from_url(self, url, content_type):
        """Get the file extension from the URL or the content type."""
        # Try the URL path first
        parsed_url = urlparse(url)
        path = parsed_url.path
        if '.' in path:
            return os.path.splitext(path)[1]

        # Fall back to the content type
        content_type_map = {
            'image/jpeg': '.jpg',
            'image/jpg': '.jpg',
            'image/png': '.png',
            'image/gif': '.gif',
            'image/webp': '.webp',
            'image/svg+xml': '.svg',
            'video/mp4': '.mp4',
            'video/avi': '.avi',
            'video/mov': '.mov',
            'video/wmv': '.wmv',
            'video/flv': '.flv',
            'video/webm': '.webm',
            'audio/mp3': '.mp3',
            'audio/wav': '.wav',
            'audio/ogg': '.ogg',
            'application/pdf': '.pdf',
            'application/msword': '.doc',
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
        }

        return content_type_map.get(content_type.lower(), '.bin')

    def extract_and_download_media(self, soup, crawled_content, base_url):
        """Extract and download the media files on a page."""
        media_files = []

        # Images
        images = soup.find_all('img')
        self.log('info', f'找到 {len(images)} 个图片标签', crawled_content.website)

        for img in images:
            src = img.get('src')
            if src:
                # Resolve relative URLs
                if src.startswith('//'):
                    src = 'https:' + src
                elif src.startswith('/'):
                    src = urljoin(base_url, src)
                elif not src.startswith(('http://', 'https://')):
                    src = urljoin(base_url, src)

                alt_text = img.get('alt', '')
                self.log('info', f'尝试下载图片: {src}', crawled_content.website)
                media_file = self.download_media_file(src, crawled_content, 'image', alt_text)
                if media_file:
                    media_files.append(media_file)
                    self.log('info', f'成功下载图片: {media_file.local_file.name}', crawled_content.website)

        # Videos
        videos = soup.find_all(['video', 'source'])
        for video in videos:
            src = video.get('src')
            if src:
                # Resolve relative URLs
                if src.startswith('//'):
                    src = 'https:' + src
                elif src.startswith('/'):
                    src = urljoin(base_url, src)
                elif not src.startswith(('http://', 'https://')):
                    src = urljoin(base_url, src)

                media_file = self.download_media_file(src, crawled_content, 'video')
                if media_file:
                    media_files.append(media_file)

        # Audio
        audios = soup.find_all('audio')
        for audio in audios:
            src = audio.get('src')
            if src:
                # Resolve relative URLs
                if src.startswith('//'):
                    src = 'https:' + src
                elif src.startswith('/'):
                    src = urljoin(base_url, src)
                elif not src.startswith(('http://', 'https://')):
                    src = urljoin(base_url, src)

                media_file = self.download_media_file(src, crawled_content, 'audio')
                if media_file:
                    media_files.append(media_file)

        return media_files

    def mark_content_saved(self, crawled_content):
        """Mark the content as saved (the content itself is stored in the database)."""
        try:
            crawled_content.is_local_saved = True
            crawled_content.save()

            media_count = crawled_content.media_files.count()
            self.log('info', f'文章内容已保存到数据库 (包含 {media_count} 个媒体文件)', crawled_content.website)
            return True
        except Exception as e:
            self.log('error', f'标记内容保存状态失败: {str(e)}', crawled_content.website)
            return False

    def crawl_website(self, website):
        """Crawl a single website."""
        self.log('info', f'开始爬取网站: {website.name}', website)

        try:
            # Request the home page
            response = self.session.get(
                website.url,
                timeout=self.timeout,
                verify=False  # skip SSL certificate verification
            )
            response.raise_for_status()

            # Check the content encoding
            if response.encoding != 'utf-8':
                # Try to read the charset from the response headers
                content_type = response.headers.get('content-type', '')
                if 'charset=' in content_type:
                    charset = content_type.split('charset=')[-1]
                    response.encoding = charset
                else:
                    response.encoding = 'utf-8'

            soup = BeautifulSoup(response.content, 'html.parser')

            # Find article links
            article_links = self.find_article_links(soup, website.url)
            self.log('info', f'找到 {len(article_links)} 个文章链接', website)

            crawled_count = 0
            for link_info in article_links:
                try:
                    # Request the article page
                    article_response = self.session.get(
                        link_info['url'],
                        timeout=self.timeout,
                        verify=False  # skip SSL certificate verification
                    )
                    article_response.raise_for_status()

                    # Check the content encoding
                    if article_response.encoding != 'utf-8':
                        # Try to read the charset from the response headers
                        content_type = article_response.headers.get('content-type', '')
                        if 'charset=' in content_type:
                            charset = content_type.split('charset=')[-1]
                            article_response.encoding = charset
                        else:
                            article_response.encoding = 'utf-8'

                    article_soup = BeautifulSoup(article_response.content, 'html.parser')

                    # Extract the content
                    content = self.extract_article_content(link_info['url'], article_soup)
                    title = link_info['title']

                    # Check keyword matches
                    matched_keywords = self.check_keyword_match(content, title)

                    if matched_keywords:
                        # Extract additional metadata
                        publish_date = self.extract_publish_date(article_soup)
                        author = self.extract_author(article_soup)

                        # Save the content
                        crawled_content = CrawledContent.objects.create(
                            task=self.task,
                            website=website,
                            title=title,
                            content=content,
                            url=link_info['url'],
                            publish_date=publish_date,
                            author=author,
                            keywords_matched=','.join(matched_keywords),
                            is_local_saved=False  # set to True once the content has been saved locally
                        )

                        # Extract and download media files
                        media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])

                        # Mark the content as saved
                        self.mark_content_saved(crawled_content)

                        crawled_count += 1
                        self.log('info', f'保存文章: {title[:50]}...', website)

                    # Delay between requests
                    time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])

                except requests.exceptions.SSLError as e:
                    self.log('error', f'SSL错误,跳过文章 {link_info["url"]}: {str(e)}', website)
                    continue
                except requests.exceptions.ConnectionError as e:
                    self.log('error', f'连接错误,跳过文章 {link_info["url"]}: {str(e)}', website)
                    continue
                except requests.exceptions.Timeout as e:
                    self.log('error', f'请求超时,跳过文章 {link_info["url"]}: {str(e)}', website)
                    continue
                except requests.exceptions.RequestException as e:
                    self.log('error', f'网络请求错误,跳过文章 {link_info["url"]}: {str(e)}', website)
                    continue
                except UnicodeDecodeError as e:
                    self.log('error', f'字符编码错误,跳过文章 {link_info["url"]}: {str(e)}', website)
                    continue
                except Exception as e:
                    self.log('error', f'处理文章失败 {link_info["url"]}: {str(e)}', website)
                    continue

            self.log('info', f'网站爬取完成,共保存 {crawled_count} 篇文章', website)
            return crawled_count

        except requests.exceptions.SSLError as e:
            self.log('error', f'爬取网站SSL错误: {str(e)}', website)
            return 0
        except requests.exceptions.ConnectionError as e:
            self.log('error', f'爬取网站连接错误: {str(e)}', website)
            return 0
        except requests.exceptions.Timeout as e:
            self.log('error', f'爬取网站超时: {str(e)}', website)
            return 0
        except requests.exceptions.RequestException as e:
            self.log('error', f'爬取网站网络错误: {str(e)}', website)
            return 0
        except Exception as e:
            self.log('error', f'爬取网站失败: {str(e)}', website)
            return 0

    def run(self):
        """Run the crawl task."""
        self.log('info', f'开始执行爬取任务: {self.task.name}')
        self.update_task_status('running')

        total_crawled = 0
        websites = self.task.websites.filter(is_active=True)
        self.task.total_pages = websites.count()
        self.task.save()

        for website in websites:
            try:
                crawled_count = self.crawl_website(website)
                total_crawled += crawled_count
                self.task.crawled_pages += 1
                self.task.save()

            except Exception as e:
                self.log('error', f'爬取网站 {website.name} 时发生错误: {str(e)}', website)
                continue

        # Update the task status
        if total_crawled > 0:
            self.update_task_status('completed')
            self.log('info', f'爬取任务完成,共爬取 {total_crawled} 篇文章')
        else:
            self.update_task_status('failed', error_message='没有找到匹配的内容')
            self.log('error', '爬取任务失败,没有找到匹配的内容')


def run_crawl_task(task_id):
    """Run a crawl task (entry point used by the Celery task)."""
    try:
        crawler = WebsiteCrawler(task_id)
        crawler.run()
        return f"任务 {task_id} 执行完成"
    except Exception as e:
        # Record the exception in the logs
        logger.error(f"执行任务 {task_id} 时发生异常: {str(e)}", exc_info=True)

        task = CrawlTask.objects.get(id=task_id)
        task.status = 'failed'
        task.error_message = str(e)
        task.completed_at = timezone.now()
        task.save()

        CrawlLog.objects.create(
            task=task,
            level='error',
            message=f'任务执行失败: {str(e)}'
        )
        return f"任务 {task_id} 执行失败: {str(e)}"
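WebsiteCrawler reads its HTTP configuration from settings.CRAWLER_SETTINGS, which lives in the project settings and is not part of this diff. A sketch of the dict it expects; the keys come from the code above, the values are assumptions:

    CRAWLER_SETTINGS = {
        'USER_AGENT': 'Mozilla/5.0 (compatible; SiteCrawler/1.0)',  # sent with every request
        'TIMEOUT': 30,         # per-request timeout in seconds
        'REQUEST_DELAY': 2,    # seconds to sleep between article requests
        'MAX_RETRIES': 3,      # optional; the engine falls back to 3 when absent
    }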
crawler/management/__init__.py: new file, empty

crawler/management/commands/__init__.py: new file, empty

crawler/management/commands/init_websites.py: new file, 36 lines
@@ -0,0 +1,36 @@
from django.core.management.base import BaseCommand
from django.conf import settings
from crawler.models import Website


class Command(BaseCommand):
    help = '初始化目标网站数据'

    def handle(self, *args, **options):
        self.stdout.write('开始初始化目标网站数据...')

        # Clear the existing data
        Website.objects.all().delete()

        # Read the website list from settings
        websites_data = settings.TARGET_WEBSITES

        created_count = 0
        for website_data in websites_data:
            website, created = Website.objects.get_or_create(
                url=website_data['url'],
                defaults={
                    'name': website_data['name'],
                    'region': website_data['region'],
                    'is_active': True
                }
            )
            if created:
                created_count += 1
                self.stdout.write(f'创建网站: {website.name}')
            else:
                self.stdout.write(f'网站已存在: {website.name}')

        self.stdout.write(
            self.style.SUCCESS(f'成功初始化 {created_count} 个网站')
        )
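init_websites reads settings.TARGET_WEBSITES, which is not shown in this commit. A sketch of the expected shape; the dictionary keys come from the command above, the sample entries are invented:

    TARGET_WEBSITES = [
        {'name': '示例新闻网', 'url': 'https://news.example.com', 'region': '华东'},
        {'name': '示例门户', 'url': 'https://portal.example.org', 'region': '华南'},
    ]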
crawler/management/commands/run_crawler.py: new file, 69 lines
@@ -0,0 +1,69 @@
from django.core.management.base import BaseCommand
from crawler.models import CrawlTask, Website
from crawler.tasks import crawl_websites_task


class Command(BaseCommand):
    help = '运行爬虫任务'

    def add_arguments(self, parser):
        parser.add_argument(
            '--keywords',
            type=str,
            required=True,
            help='搜索关键字,多个关键字用逗号分隔'
        )
        parser.add_argument(
            '--websites',
            type=str,
            help='网站ID列表,用逗号分隔。不指定则爬取所有网站'
        )
        parser.add_argument(
            '--name',
            type=str,
            help='任务名称'
        )

    def handle(self, *args, **options):
        keywords = options['keywords']
        website_ids = options.get('websites')
        # --name is present but None when not given, so fall back explicitly
        task_name = options.get('name') or f'关键字搜索: {keywords}'

        # Resolve the target websites
        if website_ids:
            website_id_list = [int(id.strip()) for id in website_ids.split(',')]
            websites = Website.objects.filter(id__in=website_id_list, is_active=True)
        else:
            websites = Website.objects.filter(is_active=True)

        if not websites.exists():
            self.stdout.write(
                self.style.ERROR('没有找到可用的网站')
            )
            return

        # Create the task
        task = CrawlTask.objects.create(
            name=task_name,
            keywords=keywords,
            created_by='management_command'
        )
        task.websites.set(websites)

        self.stdout.write(f'创建任务: {task.name}')
        self.stdout.write(f'目标网站: {websites.count()} 个')
        self.stdout.write(f'搜索关键字: {keywords}')

        # Start the task (falls back to running synchronously when the queue is unavailable)
        try:
            crawl_websites_task.delay(task.id)
            self.stdout.write('任务已提交到队列')
        except Exception as e:
            self.stdout.write(f'队列不可用,直接运行任务: {e}')
            from crawler.crawler_engine import WebsiteCrawler
            crawler = WebsiteCrawler(task.id)
            crawler.run()

        self.stdout.write(
            self.style.SUCCESS(f'任务已启动,任务ID: {task.id}')
        )
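With this command in place, a crawl can be started from the shell; the keywords, website IDs, and task name below are only illustrative:

    python manage.py run_crawler --keywords "教育,科技" --websites 1,3 --name "每日抓取"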
crawler/migrations/0001_initial.py: new file, 106 lines
@@ -0,0 +1,106 @@
# Generated by Django 5.2.6 on 2025-09-22 16:27

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

    initial = True

    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='SearchKeyword',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('keyword', models.CharField(max_length=100, unique=True, verbose_name='关键字')),
                ('is_active', models.BooleanField(default=True, verbose_name='是否启用')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('last_used', models.DateTimeField(blank=True, null=True, verbose_name='最后使用时间')),
            ],
            options={
                'verbose_name': '搜索关键字',
                'verbose_name_plural': '搜索关键字',
                'ordering': ['-last_used', '-created_at'],
            },
        ),
        migrations.CreateModel(
            name='Website',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=100, verbose_name='网站名称')),
                ('url', models.URLField(verbose_name='网站地址')),
                ('region', models.CharField(max_length=50, verbose_name='所属地区')),
                ('is_active', models.BooleanField(default=True, verbose_name='是否启用')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('updated_at', models.DateTimeField(auto_now=True, verbose_name='更新时间')),
            ],
            options={
                'verbose_name': '目标网站',
                'verbose_name_plural': '目标网站',
                'ordering': ['region', 'name'],
            },
        ),
        migrations.CreateModel(
            name='CrawlTask',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=200, verbose_name='任务名称')),
                ('keywords', models.TextField(help_text='多个关键字用逗号分隔', verbose_name='搜索关键字')),
                ('status', models.CharField(choices=[('pending', '待执行'), ('running', '执行中'), ('completed', '已完成'), ('failed', '执行失败'), ('cancelled', '已取消')], default='pending', max_length=20, verbose_name='任务状态')),
                ('created_by', models.CharField(default='system', max_length=100, verbose_name='创建者')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('started_at', models.DateTimeField(blank=True, null=True, verbose_name='开始时间')),
                ('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='完成时间')),
                ('error_message', models.TextField(blank=True, verbose_name='错误信息')),
                ('total_pages', models.IntegerField(default=0, verbose_name='总页数')),
                ('crawled_pages', models.IntegerField(default=0, verbose_name='已爬取页数')),
                ('websites', models.ManyToManyField(to='crawler.website', verbose_name='目标网站')),
            ],
            options={
                'verbose_name': '爬取任务',
                'verbose_name_plural': '爬取任务',
                'ordering': ['-created_at'],
            },
        ),
        migrations.CreateModel(
            name='CrawlLog',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('level', models.CharField(choices=[('info', '信息'), ('warning', '警告'), ('error', '错误'), ('debug', '调试')], max_length=20, verbose_name='日志级别')),
                ('message', models.TextField(verbose_name='日志消息')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='记录时间')),
                ('task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='logs', to='crawler.crawltask', verbose_name='所属任务')),
                ('website', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='crawler.website', verbose_name='相关网站')),
            ],
            options={
                'verbose_name': '爬取日志',
                'verbose_name_plural': '爬取日志',
                'ordering': ['-created_at'],
            },
        ),
        migrations.CreateModel(
            name='CrawledContent',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('title', models.CharField(max_length=500, verbose_name='标题')),
                ('content', models.TextField(verbose_name='内容')),
                ('url', models.URLField(verbose_name='原文链接')),
                ('publish_date', models.DateTimeField(blank=True, null=True, verbose_name='发布时间')),
                ('author', models.CharField(blank=True, max_length=100, verbose_name='作者')),
                ('keywords_matched', models.TextField(help_text='匹配到的关键字,用逗号分隔', verbose_name='匹配的关键字')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='爬取时间')),
                ('task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='contents', to='crawler.crawltask', verbose_name='所属任务')),
                ('website', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawler.website', verbose_name='来源网站')),
            ],
            options={
                'verbose_name': '爬取内容',
                'verbose_name_plural': '爬取内容',
                'ordering': ['-created_at'],
                'indexes': [models.Index(fields=['task', 'website'], name='crawler_cra_task_id_6244e7_idx'), models.Index(fields=['created_at'], name='crawler_cra_created_a116d2_idx'), models.Index(fields=['publish_date'], name='crawler_cra_publish_5b8ccc_idx')],
            },
        ),
    ]
crawler/migrations/0002_crawledcontent_is_local_saved_and_more.py: new file, 24 lines
@@ -0,0 +1,24 @@
# Generated by Django 5.2.6 on 2025-09-23 00:38

import crawler.models
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('crawler', '0001_initial'),
    ]

    operations = [
        migrations.AddField(
            model_name='crawledcontent',
            name='is_local_saved',
            field=models.BooleanField(default=False, verbose_name='是否已本地保存'),
        ),
        migrations.AddField(
            model_name='crawledcontent',
            name='local_file',
            field=models.FileField(blank=True, null=True, upload_to=crawler.models.crawled_content_file_path, verbose_name='本地文件'),
        ),
    ]
crawler/migrations/0003_mediafile.py: new file, 35 lines
@@ -0,0 +1,35 @@
# Generated by Django 5.2.6 on 2025-09-23 01:05

import crawler.models
import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('crawler', '0002_crawledcontent_is_local_saved_and_more'),
    ]

    operations = [
        migrations.CreateModel(
            name='MediaFile',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('media_type', models.CharField(choices=[('image', '图片'), ('video', '视频'), ('audio', '音频'), ('document', '文档')], max_length=20, verbose_name='媒体类型')),
                ('original_url', models.URLField(verbose_name='原始URL')),
                ('local_file', models.FileField(upload_to=crawler.models.media_file_path, verbose_name='本地文件')),
                ('file_size', models.BigIntegerField(blank=True, null=True, verbose_name='文件大小(字节)')),
                ('mime_type', models.CharField(blank=True, max_length=100, verbose_name='MIME类型')),
                ('alt_text', models.CharField(blank=True, max_length=500, verbose_name='替代文本')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('content', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='media_files', to='crawler.crawledcontent', verbose_name='所属内容')),
            ],
            options={
                'verbose_name': '媒体文件',
                'verbose_name_plural': '媒体文件',
                'ordering': ['-created_at'],
                'indexes': [models.Index(fields=['content', 'media_type'], name='crawler_med_content_3a9468_idx'), models.Index(fields=['created_at'], name='crawler_med_created_13ff00_idx')],
            },
        ),
    ]
crawler/migrations/__init__.py: new file, empty

crawler/models.py: new file, 195 lines
@@ -0,0 +1,195 @@
from django.db import models
from django.utils import timezone
import os


def crawled_content_file_path(instance, filename):
    """Build the storage path for crawled content files."""
    # Use the task id and a timestamp to create a unique file name
    timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    name, ext = os.path.splitext(filename)
    return f'crawled_content/{instance.task.id}/{timestamp}_{instance.id}{ext}'


def media_file_path(instance, filename):
    """Build the storage path for media files."""
    # Use the task id and the content id to build the media file path
    timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
    name, ext = os.path.splitext(filename)
    return f'{instance.content.task.id}/{instance.content.id}/{timestamp}_{name}{ext}'


class Website(models.Model):
    """Target website."""
    name = models.CharField(max_length=100, verbose_name='网站名称')
    url = models.URLField(verbose_name='网站地址')
    region = models.CharField(max_length=50, verbose_name='所属地区')
    is_active = models.BooleanField(default=True, verbose_name='是否启用')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
    updated_at = models.DateTimeField(auto_now=True, verbose_name='更新时间')

    class Meta:
        verbose_name = '目标网站'
        verbose_name_plural = '目标网站'
        ordering = ['region', 'name']

    def __str__(self):
        return f"{self.region} - {self.name}"


class CrawlTask(models.Model):
    """Crawl task."""
    TASK_STATUS_CHOICES = [
        ('pending', '待执行'),
        ('running', '执行中'),
        ('completed', '已完成'),
        ('failed', '执行失败'),
        ('cancelled', '已取消'),
    ]

    name = models.CharField(max_length=200, verbose_name='任务名称')
    keywords = models.TextField(verbose_name='搜索关键字', help_text='多个关键字用逗号分隔')
    websites = models.ManyToManyField(Website, verbose_name='目标网站')
    status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name='任务状态')
    created_by = models.CharField(max_length=100, verbose_name='创建者', default='system')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
    started_at = models.DateTimeField(null=True, blank=True, verbose_name='开始时间')
    completed_at = models.DateTimeField(null=True, blank=True, verbose_name='完成时间')
    error_message = models.TextField(blank=True, verbose_name='错误信息')
    total_pages = models.IntegerField(default=0, verbose_name='总页数')
    crawled_pages = models.IntegerField(default=0, verbose_name='已爬取页数')

    class Meta:
        verbose_name = '爬取任务'
        verbose_name_plural = '爬取任务'
        ordering = ['-created_at']

    def __str__(self):
        return f"{self.name} - {self.get_status_display()}"

    @property
    def progress_percentage(self):
        """Task progress as a percentage."""
        if self.total_pages == 0:
            return 0
        return round((self.crawled_pages / self.total_pages) * 100, 2)


class CrawledContent(models.Model):
    """Crawled content."""
    task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='contents', verbose_name='所属任务')
    website = models.ForeignKey(Website, on_delete=models.CASCADE, verbose_name='来源网站')
    title = models.CharField(max_length=500, verbose_name='标题')
    content = models.TextField(verbose_name='内容')
    url = models.URLField(verbose_name='原文链接')
    publish_date = models.DateTimeField(null=True, blank=True, verbose_name='发布时间')
    author = models.CharField(max_length=100, blank=True, verbose_name='作者')
    keywords_matched = models.TextField(verbose_name='匹配的关键字', help_text='匹配到的关键字,用逗号分隔')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='爬取时间')

    # Local storage fields
    local_file = models.FileField(upload_to=crawled_content_file_path, blank=True, null=True, verbose_name='本地文件')
    is_local_saved = models.BooleanField(default=False, verbose_name='是否已本地保存')

    class Meta:
        verbose_name = '爬取内容'
        verbose_name_plural = '爬取内容'
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['task', 'website']),
            models.Index(fields=['created_at']),
            models.Index(fields=['publish_date']),
        ]

    def __str__(self):
        return f"{self.website.name} - {self.title[:50]}"

    def get_preview_content(self, max_length=500):
        """Return a preview of the content."""
        if len(self.content) <= max_length:
            return self.content
        return self.content[:max_length] + '...'


class MediaFile(models.Model):
    """Media file."""
    MEDIA_TYPE_CHOICES = [
        ('image', '图片'),
        ('video', '视频'),
        ('audio', '音频'),
        ('document', '文档'),
    ]

    content = models.ForeignKey(CrawledContent, on_delete=models.CASCADE, related_name='media_files', verbose_name='所属内容')
    media_type = models.CharField(max_length=20, choices=MEDIA_TYPE_CHOICES, verbose_name='媒体类型')
    original_url = models.URLField(verbose_name='原始URL')
    local_file = models.FileField(upload_to=media_file_path, verbose_name='本地文件')
    file_size = models.BigIntegerField(null=True, blank=True, verbose_name='文件大小(字节)')
    mime_type = models.CharField(max_length=100, blank=True, verbose_name='MIME类型')
    alt_text = models.CharField(max_length=500, blank=True, verbose_name='替代文本')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')

    class Meta:
        verbose_name = '媒体文件'
        verbose_name_plural = '媒体文件'
        ordering = ['-created_at']
        indexes = [
            models.Index(fields=['content', 'media_type']),
            models.Index(fields=['created_at']),
        ]

    def __str__(self):
        return f"{self.get_media_type_display()} - {self.original_url}"

    @property
    def file_size_display(self):
        """Human-readable file size."""
        if not self.file_size:
            return "未知"

        size = self.file_size
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size < 1024.0:
                return f"{size:.1f} {unit}"
            size /= 1024.0
        return f"{size:.1f} TB"


class CrawlLog(models.Model):
    """Crawl log entry."""
    LOG_LEVEL_CHOICES = [
        ('info', '信息'),
        ('warning', '警告'),
        ('error', '错误'),
        ('debug', '调试'),
    ]

    task = models.ForeignKey(CrawlTask, on_delete=models.CASCADE, related_name='logs', verbose_name='所属任务')
    website = models.ForeignKey(Website, on_delete=models.CASCADE, null=True, blank=True, verbose_name='相关网站')
    level = models.CharField(max_length=20, choices=LOG_LEVEL_CHOICES, verbose_name='日志级别')
    message = models.TextField(verbose_name='日志消息')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='记录时间')

    class Meta:
        verbose_name = '爬取日志'
        verbose_name_plural = '爬取日志'
        ordering = ['-created_at']

    def __str__(self):
        return f"[{self.get_level_display()}] {self.message[:100]}"


class SearchKeyword(models.Model):
    """Search keyword."""
    keyword = models.CharField(max_length=100, unique=True, verbose_name='关键字')
    is_active = models.BooleanField(default=True, verbose_name='是否启用')
    created_at = models.DateTimeField(auto_now_add=True, verbose_name='创建时间')
    last_used = models.DateTimeField(null=True, blank=True, verbose_name='最后使用时间')

    class Meta:
        verbose_name = '搜索关键字'
        verbose_name_plural = '搜索关键字'
        ordering = ['-last_used', '-created_at']

    def __str__(self):
        return self.keyword
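The FileField paths above are relative to the default storage, and the admin previews build URLs with a hard-coded /media/ prefix, so the project is assumed to use the standard Django media configuration, which this commit does not include. A sketch of that assumption:

    # settings.py (assumed)
    MEDIA_URL = '/media/'
    MEDIA_ROOT = BASE_DIR / 'media'

    # project urls.py (assumed), so previews resolve during development
    from django.conf import settings
    from django.conf.urls.static import static

    urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)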
crawler/serializers.py: new file, 51 lines
@@ -0,0 +1,51 @@
from rest_framework import serializers
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile


class WebsiteSerializer(serializers.ModelSerializer):
    class Meta:
        model = Website
        fields = '__all__'


class CrawlTaskSerializer(serializers.ModelSerializer):
    websites = WebsiteSerializer(many=True, read_only=True)
    progress_percentage = serializers.ReadOnlyField()

    class Meta:
        model = CrawlTask
        fields = '__all__'


class MediaFileSerializer(serializers.ModelSerializer):
    file_size_display = serializers.ReadOnlyField()

    class Meta:
        model = MediaFile
        fields = '__all__'


class CrawledContentSerializer(serializers.ModelSerializer):
    website_name = serializers.CharField(source='website.name', read_only=True)
    website_region = serializers.CharField(source='website.region', read_only=True)
    task_name = serializers.CharField(source='task.name', read_only=True)
    media_files = MediaFileSerializer(many=True, read_only=True)

    class Meta:
        model = CrawledContent
        fields = '__all__'


class CrawlLogSerializer(serializers.ModelSerializer):
    website_name = serializers.CharField(source='website.name', read_only=True)
    task_name = serializers.CharField(source='task.name', read_only=True)

    class Meta:
        model = CrawlLog
        fields = '__all__'


class SearchKeywordSerializer(serializers.ModelSerializer):
    class Meta:
        model = SearchKeyword
        fields = '__all__'
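No API views or routes are included in this commit, so how these serializers are consumed is left open here; one plausible read-only wiring, with every name below being an assumption, would be:

    # Hypothetical api.py, not part of this commit
    from rest_framework import routers, viewsets
    from .models import CrawledContent
    from .serializers import CrawledContentSerializer

    class CrawledContentViewSet(viewsets.ReadOnlyModelViewSet):
        queryset = (CrawledContent.objects
                    .select_related('website', 'task')
                    .prefetch_related('media_files'))
        serializer_class = CrawledContentSerializer

    router = routers.DefaultRouter()
    router.register('contents', CrawledContentViewSet)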
crawler/tasks.py: new file, 36 lines
@@ -0,0 +1,36 @@
from celery import shared_task
from .crawler_engine import run_crawl_task as execute_crawl_task


@shared_task
def crawl_websites_task(task_id):
    """Celery task that crawls the websites."""
    return execute_crawl_task(task_id)


@shared_task
def run_crawl_task(task_id):
    """Celery task that runs a crawl task (used by the admin interface)."""
    return execute_crawl_task(task_id)


@shared_task
def cleanup_old_tasks():
    """Clean up old tasks (keep only the last 30 days)."""
    from django.utils import timezone
    from datetime import timedelta
    from .models import CrawlTask, CrawlLog, CrawledContent

    cutoff_date = timezone.now() - timedelta(days=30)

    # Delete tasks older than 30 days together with their related data
    old_tasks = CrawlTask.objects.filter(created_at__lt=cutoff_date)
    count = old_tasks.count()

    for task in old_tasks:
        # Delete the related content and logs
        CrawledContent.objects.filter(task=task).delete()
        CrawlLog.objects.filter(task=task).delete()
        task.delete()

    return f"清理了 {count} 个旧任务"
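The @shared_task functions above assume a Celery application is configured at the project level, which this commit does not include. A sketch of that wiring; the project module name, settings path, and the beat schedule are assumptions:

    # <project>/celery.py (assumed)
    import os
    from celery import Celery
    from celery.schedules import crontab

    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings')  # assumed settings module

    app = Celery('crawler_project')
    app.config_from_object('django.conf:settings', namespace='CELERY')
    app.autodiscover_tasks()

    # Optionally run cleanup_old_tasks every night via Celery beat
    app.conf.beat_schedule = {
        'cleanup-old-tasks': {
            'task': 'crawler.tasks.cleanup_old_tasks',
            'schedule': crontab(hour=3, minute=0),
        },
    }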
crawler/templates/crawler/base.html: new file, 80 lines
@@ -0,0 +1,80 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{% block title %}网站爬虫系统{% endblock %}</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.7.2/font/bootstrap-icons.css" rel="stylesheet">
    <style>
        .navbar-brand {
            font-weight: bold;
        }
        .stats-card {
            transition: transform 0.2s;
        }
        .stats-card:hover {
            transform: translateY(-2px);
        }
        .content-preview {
            max-height: 100px;
            overflow: hidden;
            text-overflow: ellipsis;
        }
        .keyword-badge {
            background-color: #e3f2fd;
            color: #1976d2;
            padding: 2px 8px;
            border-radius: 12px;
            font-size: 0.8em;
            margin-right: 5px;
        }
    </style>
</head>
<body>
    <nav class="navbar navbar-expand-lg navbar-dark bg-primary">
        <div class="container">
            <a class="navbar-brand" href="{% url 'dashboard' %}">
                <i class="bi bi-search"></i> 网站爬虫系统
            </a>
            <button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav">
                <span class="navbar-toggler-icon"></span>
            </button>
            <div class="collapse navbar-collapse" id="navbarNav">
                <ul class="navbar-nav me-auto">
                    <li class="nav-item">
                        <a class="nav-link" href="{% url 'dashboard' %}">
                            <i class="bi bi-house"></i> 仪表板
                        </a>
                    </li>
                    <li class="nav-item">
                        <a class="nav-link" href="{% url 'search' %}">
                            <i class="bi bi-search"></i> 搜索
                        </a>
                    </li>
                    <li class="nav-item">
                        <a class="nav-link" href="/admin/">
                            <i class="bi bi-gear"></i> 管理后台
                        </a>
                    </li>
                </ul>
            </div>
        </div>
    </nav>

    <main class="container mt-4">
        {% block content %}
        {% endblock %}
    </main>

    <footer class="bg-light mt-5 py-4">
        <div class="container text-center">
            <p class="text-muted mb-0">网站爬虫系统 © 2024</p>
        </div>
    </footer>

    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
    {% block extra_js %}
    {% endblock %}
</body>
</html>
320 crawler/templates/crawler/dashboard.html Normal file
@@ -0,0 +1,320 @@
{% extends 'crawler/base.html' %}
{% load custom_filters %}

{% block title %}仪表板 - 网站爬虫系统{% endblock %}

{% block content %}
<div class="row">
    <div class="col-12">
        <h1 class="mb-4">
            <i class="bi bi-speedometer2"></i> 系统仪表板
        </h1>
    </div>
</div>

<!-- Stats cards -->
<div class="row mb-4">
    <div class="col-md-3 mb-3">
        <div class="card stats-card bg-primary text-white">
            <div class="card-body">
                <div class="d-flex justify-content-between">
                    <div>
                        <h4 class="card-title">{{ stats.total_websites }}</h4>
                        <p class="card-text">监控网站</p>
                    </div>
                    <div class="align-self-center">
                        <i class="bi bi-globe fs-1"></i>
                    </div>
                </div>
            </div>
        </div>
    </div>

    <div class="col-md-3 mb-3">
        <div class="card stats-card bg-success text-white">
            <div class="card-body">
                <div class="d-flex justify-content-between">
                    <div>
                        <h4 class="card-title">{{ stats.total_tasks }}</h4>
                        <p class="card-text">爬取任务</p>
                    </div>
                    <div class="align-self-center">
                        <i class="bi bi-list-task fs-1"></i>
                    </div>
                </div>
            </div>
        </div>
    </div>

    <div class="col-md-3 mb-3">
        <div class="card stats-card bg-info text-white">
            <div class="card-body">
                <div class="d-flex justify-content-between">
                    <div>
                        <h4 class="card-title">{{ stats.total_contents }}</h4>
                        <p class="card-text">爬取内容</p>
                    </div>
                    <div class="align-self-center">
                        <i class="bi bi-file-text fs-1"></i>
                    </div>
                </div>
            </div>
        </div>
    </div>

    <div class="col-md-3 mb-3">
        <div class="card stats-card bg-warning text-white">
            <div class="card-body">
                <div class="d-flex justify-content-between">
                    <div>
                        <h4 class="card-title">{{ stats.active_tasks }}</h4>
                        <p class="card-text">运行中任务</p>
                    </div>
                    <div class="align-self-center">
                        <i class="bi bi-arrow-clockwise fs-1"></i>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>

<div class="row">
    <!-- Website filter and pagination controls -->
    <div class="col-12 mb-3">
        <div class="card">
            <div class="card-body">
                <div class="row">
                    <div class="col-md-6">
                        <form method="get" class="d-flex">
                            <select name="website" class="form-select me-2" onchange="this.form.submit()">
                                <option value="">所有网站</option>
                                {% for website in stats.websites %}
                                <option value="{{ website.id }}" {% if website.id == stats.selected_website_id %}selected{% endif %}>
                                    {{ website.name }} ({{ website.region }})
                                </option>
                                {% endfor %}
                            </select>

                            <select name="page_size" class="form-select me-2" onchange="this.form.submit()">
                                <option value="10" {% if stats.page_size == 10 %}selected{% endif %}>10条/页</option>
                                <option value="20" {% if stats.page_size == 20 %}selected{% endif %}>20条/页</option>
                                <option value="50" {% if stats.page_size == 50 %}selected{% endif %}>50条/页</option>
                                <option value="100" {% if stats.page_size == 100 %}selected{% endif %}>100条/页</option>
                            </select>

                            <noscript>
                                <button type="submit" class="btn btn-primary">应用</button>
                            </noscript>
                        </form>
                    </div>
                    <div class="col-md-6">
                        <!-- Pagination -->
                        {% if stats.page_obj.has_other_pages %}
                        <nav aria-label="页面导航">
                            <ul class="pagination justify-content-end mb-0">
                                {% if stats.page_obj.has_previous %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ stats.page_obj.previous_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="上一页">
                                        <span aria-hidden="true">«</span>
                                    </a>
                                </li>
                                {% endif %}

                                {% for num in stats.page_obj.paginator.page_range %}
                                {% if stats.page_obj.number == num %}
                                <li class="page-item active">
                                    <span class="page-link">{{ num }}</span>
                                </li>
                                {% elif num > stats.page_obj.number|add:'-3' and num < stats.page_obj.number|add:'3' %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ num }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}">{{ num }}</a>
                                </li>
                                {% endif %}
                                {% endfor %}

                                {% if stats.page_obj.has_next %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ stats.page_obj.next_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="下一页">
                                        <span aria-hidden="true">»</span>
                                    </a>
                                </li>
                                {% endif %}
                            </ul>
                        </nav>
                        {% endif %}
                    </div>
                </div>
            </div>
        </div>
    </div>

    <!-- Content grouped by website -->
    <div class="col-md-8">
        {% for website_name, contents in stats.contents_by_website.items %}
        <div class="card mb-4">
            <div class="card-header">
                <h5 class="card-title mb-0">
                    <i class="bi bi-globe"></i> {{ website_name }}
                    <span class="badge bg-secondary">{{ contents|length }}</span>
                </h5>
            </div>
            <div class="card-body">
                <div class="list-group list-group-flush">
                    {% for content in contents %}
                    <div class="list-group-item">
                        <div class="d-flex w-100 justify-content-between">
                            <h6 class="mb-1">
                                {% if content.is_local_saved %}
                                <a href="{% url 'preview_crawled_content' content.id %}" target="_blank" class="text-decoration-none">
                                    {{ content.title|truncatechars:60 }}
                                </a>
                                {% else %}
                                <a href="{{ content.url }}" target="_blank" class="text-decoration-none">
                                    {{ content.title|truncatechars:60 }}
                                </a>
                                {% endif %}
                            </h6>
                            <small class="text-muted">{{ content.created_at|date:"m-d H:i" }}</small>
                        </div>
                        <p class="mb-1 content-preview">{{ content.content|truncatechars:100 }}</p>
                        <div class="d-flex justify-content-between align-items-center">
                            <small class="text-muted">
                                <i class="bi bi-geo-alt"></i> {{ content.website.region }}
                                {% if content.media_files.count > 0 %}
                                | <i class="bi bi-image"></i> {{ content.media_files.count }} 个媒体文件
                                {% endif %}
                            </small>
                            <div>
                                {% for keyword in content.keywords_matched|split:"," %}
                                <span class="keyword-badge">{{ keyword|strip }}</span>
                                {% endfor %}
                            </div>
                        </div>
                    </div>
                    {% endfor %}
                </div>
            </div>
        </div>
        {% empty %}
        <div class="card">
            <div class="card-body text-center">
                <p class="text-muted py-3">暂无爬取内容</p>
            </div>
        </div>
        {% endfor %}

        <!-- Pagination summary -->
        {% if stats.page_obj.has_other_pages %}
        <div class="card">
            <div class="card-body">
                <div class="d-flex justify-content-between align-items-center">
                    <div>
                        显示第 {{ stats.page_obj.start_index }} 到 {{ stats.page_obj.end_index }} 条,共 {{ stats.page_obj.paginator.count }} 条记录
                    </div>
                    <div>
                        <!-- Pagination repeated here for convenience -->
                        <nav aria-label="页面导航">
                            <ul class="pagination mb-0">
                                {% if stats.page_obj.has_previous %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ stats.page_obj.previous_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="上一页">
                                        <span aria-hidden="true">«</span>
                                    </a>
                                </li>
                                {% endif %}

                                {% for num in stats.page_obj.paginator.page_range %}
                                {% if stats.page_obj.number == num %}
                                <li class="page-item active">
                                    <span class="page-link">{{ num }}</span>
                                </li>
                                {% elif num > stats.page_obj.number|add:'-3' and num < stats.page_obj.number|add:'3' %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ num }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}">{{ num }}</a>
                                </li>
                                {% endif %}
                                {% endfor %}

                                {% if stats.page_obj.has_next %}
                                <li class="page-item">
                                    <a class="page-link" href="?page={{ stats.page_obj.next_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="下一页">
                                        <span aria-hidden="true">»</span>
                                    </a>
                                </li>
                                {% endif %}
                            </ul>
                        </nav>
                    </div>
                </div>
            </div>
        </div>
        {% endif %}
    </div>

    <!-- Recent tasks -->
    <div class="col-md-4">
        <div class="card">
            <div class="card-header">
                <h5 class="card-title mb-0">
                    <i class="bi bi-list-check"></i> 最近的任务
                </h5>
            </div>
            <div class="card-body">
                {% if stats.recent_tasks %}
                <div class="list-group list-group-flush">
                    {% for task in stats.recent_tasks %}
                    <div class="list-group-item">
                        <div class="d-flex w-100 justify-content-between">
                            <h6 class="mb-1">{{ task.name|truncatechars:30 }}</h6>
                            <span class="badge bg-{% if task.status == 'completed' %}success{% elif task.status == 'failed' %}danger{% elif task.status == 'running' %}warning{% else %}secondary{% endif %}">
                                {{ task.get_status_display }}
                            </span>
                        </div>
                        <p class="mb-1">
                            <small class="text-muted">关键字: {{ task.keywords|truncatechars:40 }}</small>
                        </p>
                        <small class="text-muted">{{ task.created_at|date:"m-d H:i" }}</small>
                    </div>
                    {% endfor %}
                </div>
                {% else %}
                <p class="text-muted text-center py-3">暂无任务</p>
                {% endif %}
            </div>
        </div>
    </div>
</div>

<!-- Quick actions -->
<div class="row mt-4">
    <div class="col-12">
        <div class="card">
            <div class="card-header">
                <h5 class="card-title mb-0">
                    <i class="bi bi-lightning"></i> 快速操作
                </h5>
            </div>
            <div class="card-body">
                <div class="row">
                    <div class="col-md-4 mb-3">
                        <a href="{% url 'search' %}" class="btn btn-primary w-100">
                            <i class="bi bi-search"></i> 搜索内容
                        </a>
                    </div>
                    <div class="col-md-4 mb-3">
                        <a href="/admin/crawler/crawltask/add/" class="btn btn-success w-100">
                            <i class="bi bi-plus-circle"></i> 创建任务
                        </a>
                    </div>
                    <div class="col-md-4 mb-3">
                        <a href="/admin/" class="btn btn-outline-secondary w-100">
                            <i class="bi bi-gear"></i> 管理后台
                        </a>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
{% endblock %}
128 crawler/templates/crawler/search.html Normal file
@@ -0,0 +1,128 @@
{% extends 'crawler/base.html' %}
{% load custom_filters %}

{% block title %}搜索内容 - 网站爬虫系统{% endblock %}

{% block content %}
<div class="row">
    <div class="col-12">
        <h1 class="mb-4">
            <i class="bi bi-search"></i> 内容搜索
        </h1>
    </div>
</div>

<!-- Search form -->
<div class="row mb-4">
    <div class="col-12">
        <div class="card">
            <div class="card-body">
                <form method="get" action="{% url 'search' %}">
                    <div class="input-group input-group-lg">
                        <input type="text"
                               class="form-control"
                               name="q"
                               value="{{ keyword }}"
                               placeholder="输入关键字搜索内容..."
                               required>
                        <button class="btn btn-primary" type="submit">
                            <i class="bi bi-search"></i> 搜索
                        </button>
                    </div>
                </form>
            </div>
        </div>
    </div>
</div>

<!-- Search results -->
{% if keyword %}
<div class="row">
    <div class="col-12">
        <div class="card">
            <div class="card-header">
                <h5 class="card-title mb-0">
                    <i class="bi bi-list-ul"></i> 搜索结果
                    {% if contents %}
                    <span class="badge bg-primary ms-2">{{ contents|length }} 条结果</span>
                    {% endif %}
                </h5>
            </div>
            <div class="card-body">
                {% if contents %}
                <div class="list-group list-group-flush">
                    {% for content in contents %}
                    <div class="list-group-item">
                        <div class="d-flex w-100 justify-content-between">
                            <h5 class="mb-1">
                                {% if content.is_local_saved %}
                                <a href="{% url 'preview_crawled_content' content.id %}" target="_blank" class="text-decoration-none">
                                    {{ content.title }}
                                </a>
                                {% else %}
                                <a href="{{ content.url }}" target="_blank" class="text-decoration-none">
                                    {{ content.title }}
                                </a>
                                {% endif %}
                            </h5>
                            <small class="text-muted">{{ content.created_at|date:"Y-m-d H:i" }}</small>
                        </div>
                        <p class="mb-2 content-preview">{{ content.content|truncatechars:200 }}</p>
                        <div class="d-flex justify-content-between align-items-center">
                            <small class="text-muted">
                                <i class="bi bi-geo-alt"></i> {{ content.website.region }} - {{ content.website.name }}
                                {% if content.author %}
                                | <i class="bi bi-person"></i> {{ content.author }}
                                {% endif %}
                                {% if content.publish_date %}
                                | <i class="bi bi-calendar"></i> {{ content.publish_date|date:"Y-m-d" }}
                                {% endif %}
                                {% if content.media_files.count > 0 %}
                                | <i class="bi bi-image"></i> {{ content.media_files.count }} 个媒体文件
                                {% endif %}
                            </small>
                            <div>
                                {% for keyword in content.keywords_matched|split:"," %}
                                <span class="keyword-badge">{{ keyword|strip }}</span>
                                {% endfor %}
                            </div>
                        </div>
                    </div>
                    {% endfor %}
                </div>
                {% else %}
                <div class="text-center py-5">
                    <i class="bi bi-search fs-1 text-muted"></i>
                    <p class="text-muted mt-3">没有找到包含 "{{ keyword }}" 的内容</p>
                    <p class="text-muted">请尝试其他关键字或检查爬取任务是否正常运行</p>
                </div>
                {% endif %}
            </div>
        </div>
    </div>
</div>
{% else %}
<!-- Search tips -->
<div class="row">
    <div class="col-12">
        <div class="card">
            <div class="card-body text-center py-5">
                <i class="bi bi-search fs-1 text-muted"></i>
                <h4 class="text-muted mt-3">开始搜索</h4>
                <p class="text-muted">在上方输入框中输入关键字,搜索已爬取的内容</p>
                <div class="mt-4">
                    <h6>搜索建议:</h6>
                    <div class="d-flex flex-wrap justify-content-center gap-2">
                        <span class="badge bg-light text-dark">反腐败</span>
                        <span class="badge bg-light text-dark">纪律检查</span>
                        <span class="badge bg-light text-dark">监督</span>
                        <span class="badge bg-light text-dark">廉政</span>
                        <span class="badge bg-light text-dark">违纪</span>
                    </div>
                </div>
            </div>
        </div>
    </div>
</div>
{% endif %}
{% endblock %}
0 crawler/templatetags/__init__.py Normal file
32 crawler/templatetags/custom_filters.py Normal file
@@ -0,0 +1,32 @@
from django import template

register = template.Library()


@register.filter
def split(value, separator=','):
    """Split a string by separator"""
    if not value:
        return []
    return value.split(separator)


@register.filter
def strip(value):
    """Strip whitespace from a string"""
    if not value:
        return ''
    return value.strip()


@register.filter
def div(value, divisor):
    """Divide value by divisor"""
    try:
        value = float(value)
        divisor = float(divisor)
        if divisor == 0:
            return 0
        return value / divisor
    except (ValueError, TypeError):
        return 0
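A quick sanity check of the filters above, as one might run it from `python manage.py shell` (illustrative only, not part of the commit):

from crawler.templatetags.custom_filters import split, strip, div

assert split("廉政, 监督") == ["廉政", " 监督"]  # splits on ',' by default; items keep surrounding spaces
assert strip("  监督  ") == "监督"
assert div(7, 2) == 3.5
assert div(1, 0) == 0  # division by zero falls back to 0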
3 crawler/tests.py Normal file
@@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.
8 crawler/urls.py Normal file
@@ -0,0 +1,8 @@
from django.urls import path, include
from . import views

urlpatterns = [
    path('', views.dashboard, name='dashboard'),
    path('search/', views.search_page, name='search'),
    path('crawled-content/<int:content_id>/preview/', views.preview_crawled_content, name='preview_crawled_content'),
]
292 crawler/views.py Normal file
@@ -0,0 +1,292 @@
from django.shortcuts import render, get_object_or_404
from django.http import HttpResponse
from django.db.models import Q, Count
from django.conf import settings
from django.utils import timezone
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword
from rest_framework import viewsets, filters
from rest_framework.decorators import action
from rest_framework.response import Response
from .serializers import (
    WebsiteSerializer, CrawlTaskSerializer, CrawledContentSerializer,
    CrawlLogSerializer, SearchKeywordSerializer
)
import json
from django.core.paginator import Paginator
from django.db.models.functions import TruncDate


def dashboard(request):
    """Dashboard view."""
    # Summary statistics
    total_websites = Website.objects.filter(is_active=True).count()
    total_tasks = CrawlTask.objects.count()
    total_contents = CrawledContent.objects.count()
    active_tasks = CrawlTask.objects.filter(status='running').count()

    # All active websites
    websites = Website.objects.filter(is_active=True).order_by('name')

    # Currently selected website, if any
    selected_website_id = request.GET.get('website')

    # Pagination parameters
    page_number = request.GET.get('page', 1)
    page_size = request.GET.get('page_size', 20)  # default: 20 articles per page

    # Coerce page_size to an int and keep it within a sensible range
    try:
        page_size = int(page_size)
        page_size = max(10, min(100, page_size))
    except (ValueError, TypeError):
        page_size = 20

    # All crawled content, ordered by website and creation time
    all_contents = CrawledContent.objects.select_related('website').order_by('website__name', '-created_at')

    # Filter by the selected website, if one was chosen
    if selected_website_id:
        try:
            selected_website_id = int(selected_website_id)
            all_contents = all_contents.filter(website_id=selected_website_id)
        except (ValueError, TypeError):
            pass

    # Pagination
    paginator = Paginator(all_contents, page_size)
    page_obj = paginator.get_page(page_number)

    # Group the current page's content by website
    contents_by_website = {}
    for content in page_obj:
        website_name = content.website.name
        if website_name not in contents_by_website:
            contents_by_website[website_name] = []
        contents_by_website[website_name].append(content)

    # Most recent tasks
    recent_tasks = CrawlTask.objects.order_by('-created_at')[:5]

    # Media file statistics
    total_media_files = CrawledContent.objects.aggregate(
        total_media=Count('media_files')
    )['total_media'] or 0

    stats = {
        'total_websites': total_websites,
        'total_tasks': total_tasks,
        'total_contents': total_contents,
        'active_tasks': active_tasks,
        'websites': websites,
        'selected_website_id': selected_website_id,
        'page_obj': page_obj,
        'contents_by_website': contents_by_website,
        'page_size': page_size,
        'recent_tasks': recent_tasks,
        'total_media_files': total_media_files,
    }

    return render(request, 'crawler/dashboard.html', {'stats': stats})


def search_page(request):
    """Search page view."""
    keyword = request.GET.get('q', '').strip()
    contents = []

    if keyword:
        # Record the search keyword
        SearchKeyword.objects.get_or_create(
            keyword=keyword,
            defaults={'last_used': timezone.now()}
        )

        # Search crawled content
        contents = CrawledContent.objects.filter(
            Q(title__icontains=keyword) |
            Q(content__icontains=keyword) |
            Q(keywords_matched__icontains=keyword)
        ).order_by('-created_at')[:50]

    return render(request, 'crawler/search.html', {
        'keyword': keyword,
        'contents': contents
    })


def preview_crawled_content(request, content_id):
    """Preview a piece of crawled content."""
    content = get_object_or_404(CrawledContent, id=content_id)

    # Related media files
    media_files = content.media_files.all()

    # Build the HTML for the media section
    media_section = ""
    if media_files:
        media_section = """
        <div class="media-section">
            <h3>媒体文件</h3>
        """
        for media_file in media_files:
            if media_file.media_type == 'image':
                media_section += f"""
                <div class="media-item">
                    <h4>图片: {media_file.alt_text or '无标题'}</h4>
                    <img src="/media/{media_file.local_file.name}" alt="{media_file.alt_text}" style="max-width: 100%; height: auto;">
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
            elif media_file.media_type == 'video':
                media_section += f"""
                <div class="media-item">
                    <h4>视频</h4>
                    <video controls style="max-width: 100%;">
                        <source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
                        您的浏览器不支持视频播放。
                    </video>
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
            elif media_file.media_type == 'audio':
                media_section += f"""
                <div class="media-item">
                    <h4>音频</h4>
                    <audio controls>
                        <source src="/media/{media_file.local_file.name}" type="{media_file.mime_type}">
                        您的浏览器不支持音频播放。
                    </audio>
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
            else:
                media_section += f"""
                <div class="media-item">
                    <h4>文件: {media_file.get_media_type_display()}</h4>
                    <p><a href="/media/{media_file.local_file.name}" download>下载文件</a></p>
                    <p><small>原始URL: <a href="{media_file.original_url}" target="_blank">{media_file.original_url}</a></small></p>
                    <p><small>文件大小: {media_file.file_size_display}</small></p>
                </div>
                """
        media_section += "        </div>"

    # Convert newlines in the content to paragraphs and <br> tags
    formatted_content = content.content.replace('\n\n', '</p><p>').replace('\n', '<br>')

    # Render the preview page
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>{content.title}</title>
        <style>
            body {{
                font-family: Arial, sans-serif;
                margin: 40px;
                line-height: 1.6;
                max-width: 1200px;
                margin: 40px auto;
            }}
            h1 {{ color: #333; margin-bottom: 20px; }}
            .meta {{
                color: #666;
                margin-bottom: 30px;
                padding: 20px;
                background-color: #f8f9fa;
                border-radius: 8px;
                border-left: 4px solid #007bff;
            }}
            .content {{
                line-height: 1.8;
                font-size: 16px;
                margin-bottom: 30px;
            }}
            .content p {{
                margin-bottom: 1em;
            }}
            .media-section {{
                margin-top: 30px;
                padding: 20px;
                background-color: #f8f9fa;
                border-radius: 8px;
            }}
            .media-item {{
                margin-bottom: 20px;
                padding: 15px;
                border: 1px solid #ddd;
                border-radius: 5px;
                background-color: white;
            }}
            .media-item h4 {{
                margin-top: 0;
                color: #555;
                border-bottom: 1px solid #eee;
                padding-bottom: 10px;
            }}
            .back-link {{
                margin-bottom: 20px;
            }}
            .back-link a {{
                color: #007bff;
                text-decoration: none;
                font-weight: bold;
                padding: 8px 16px;
                background-color: #f8f9fa;
                border: 1px solid #ddd;
                border-radius: 4px;
            }}
            .back-link a:hover {{
                text-decoration: underline;
                background-color: #e9ecef;
            }}
            .navbar {{
                background-color: #007bff;
                padding: 15px;
                margin-bottom: 30px;
                border-radius: 8px;
            }}
            .navbar a {{
                color: white;
                text-decoration: none;
                margin-right: 20px;
                font-weight: bold;
            }}
            .navbar a:hover {{
                text-decoration: underline;
            }}
        </style>
    </head>
    <body>
        <div class="navbar">
            <a href="/">仪表板</a>
            <a href="/admin/crawler/crawledcontent/">管理界面</a>
            <a href="javascript:history.back()">← 返回</a>
        </div>

        <h1>{content.title}</h1>

        <div class="meta">
            <p><strong>来源网站:</strong> {content.website.name} ({content.website.region})</p>
            <p><strong>原始链接:</strong> <a href="{content.url}" target="_blank">{content.url}</a></p>
            <p><strong>发布时间:</strong> {content.publish_date or '未知'}</p>
            <p><strong>作者:</strong> {content.author or '未知'}</p>
            <p><strong>匹配关键字:</strong> {content.keywords_matched}</p>
            <p><strong>爬取时间:</strong> {content.created_at}</p>
            <p><strong>媒体文件数量:</strong> {len(media_files)}</p>
        </div>

        <div class="content">
            <p>{formatted_content}</p>
        </div>

        {media_section}
    </body>
    </html>
    """
    return HttpResponse(html_content, content_type='text/html; charset=utf-8')
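One caveat worth noting: preview_crawled_content interpolates crawled text (title, body, alt text, URLs) straight into an HTML response, so any markup in the source page is rendered as-is. A minimal hardening sketch, assuming the rest of the view stays unchanged (this snippet is illustrative and not part of the commit; safe_title is a hypothetical name):

from django.utils.html import escape

# Escape external text before building the preview HTML
safe_title = escape(content.title)
formatted_content = escape(content.content).replace('\n\n', '</p><p>').replace('\n', '<br>')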
4 crawler_project/__init__.py Normal file
@@ -0,0 +1,4 @@
# Make sure the Celery app is loaded when Django starts
from .celery import app as celery_app

__all__ = ('celery_app',)
16 crawler_project/asgi.py Normal file
@@ -0,0 +1,16 @@
"""
ASGI config for crawler_project project.

It exposes the ASGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/5.2/howto/deployment/asgi/
"""

import os

from django.core.asgi import get_asgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')

application = get_asgi_application()
17 crawler_project/celery.py Normal file
@@ -0,0 +1,17 @@
import os
from celery import Celery

# Point Celery at the Django settings module
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')

app = Celery('crawler_project')

# Configure Celery from the Django settings file
app.config_from_object('django.conf:settings', namespace='CELERY')

# Auto-discover tasks in installed apps
app.autodiscover_tasks()


@app.task(bind=True)
def debug_task(self):
    print(f'Request: {self.request!r}')
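Because config_from_object is called with namespace='CELERY', any Django setting prefixed with CELERY_ (see settings.py below) is exposed on app.conf without the prefix. A quick way to confirm the wiring, assuming the Redis broker configured later in settings.py (illustrative, not part of the commit):

from crawler_project.celery import app

print(app.conf.broker_url)       # 'redis://localhost:6379/0', from CELERY_BROKER_URL
print(app.conf.task_serializer)  # 'json', from CELERY_TASK_SERIALIZER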
181 crawler_project/settings.py Normal file
@@ -0,0 +1,181 @@
"""
Django settings for crawler_project project.

Generated by 'django-admin startproject' using Django 5.2.6.

For more information on this file, see
https://docs.djangoproject.com/en/5.2/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.2/ref/settings/
"""

from pathlib import Path

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent


# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.2/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-w5lm159dl-)=z!dysfxf8!n^o26^6)4^!@5(yp*5-_c=!_tcq!'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []


# Application definition

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'crawler',
    'rest_framework',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'crawler_project.urls'

TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'crawler_project.wsgi.application'


# Database
# https://docs.djangoproject.com/en/5.2/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': BASE_DIR / 'db.sqlite3',
    }
}


# Password validation
# https://docs.djangoproject.com/en/5.2/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]


# Internationalization
# https://docs.djangoproject.com/en/5.2/topics/i18n/

LANGUAGE_CODE = 'zh-hans'

TIME_ZONE = 'Asia/Shanghai'

USE_I18N = True

USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.2/howto/static-files/

STATIC_URL = 'static/'

# Media files (downloaded and user-uploaded files)
MEDIA_URL = '/media/'
MEDIA_ROOT = BASE_DIR / 'media'

# Default primary key field type
# https://docs.djangoproject.com/en/5.2/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'

# Celery configuration
CELERY_BROKER_URL = 'redis://localhost:6379/0'
CELERY_RESULT_BACKEND = 'redis://localhost:6379/0'
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = TIME_ZONE

# Crawler configuration
CRAWLER_SETTINGS = {
    'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'REQUEST_DELAY': 1,  # delay between requests (seconds)
    'TIMEOUT': 30,       # request timeout (seconds)
    'MAX_RETRIES': 3,    # maximum number of retries
}

# Target websites
TARGET_WEBSITES = [
    {'name': '中共中央纪委', 'url': 'https://www.ccdi.gov.cn/', 'region': '中央'},
    {'name': '北京纪检监察', 'url': 'https://www.bjsupervision.gov.cn/', 'region': '北京'},
    {'name': '天津纪检监察', 'url': 'https://www.tjjw.gov.cn/', 'region': '天津'},
    {'name': '河北纪检监察', 'url': 'http://www.hebcdi.gov.cn/', 'region': '河北'},
    {'name': '山西纪检监察', 'url': 'http://www.sxdi.gov.cn/', 'region': '山西'},
    {'name': '内蒙古纪检监察', 'url': 'https://www.nmgjjjc.gov.cn/', 'region': '内蒙古'},
    {'name': '辽宁纪检监察', 'url': 'https://www.lnsjjjc.gov.cn/', 'region': '辽宁'},
    {'name': '吉林纪检监察', 'url': 'http://ccdijl.gov.cn/', 'region': '吉林'},
    {'name': '黑龙江纪检监察', 'url': 'https://www.hljjjjc.gov.cn/Hljjjjc/', 'region': '黑龙江'},
    {'name': '上海纪检监察', 'url': 'https://www.shjjjc.gov.cn/', 'region': '上海'},
    {'name': '江苏纪检监察', 'url': 'https://www.jssjw.gov.cn/', 'region': '江苏'},
    {'name': '浙江纪检监察', 'url': 'https://www.zjsjw.gov.cn/shouye/', 'region': '浙江'},
    {'name': '安徽纪检监察', 'url': 'http://www.ahjjjc.gov.cn/', 'region': '安徽'},
    {'name': '福建纪检监察', 'url': 'https://www.fjcdi.gov.cn/cms/html/fjsjwjw/index.html', 'region': '福建'},
    {'name': '江西纪检监察', 'url': 'http://www.jxdi.gov.cn/', 'region': '江西'},
    {'name': '山东纪检监察', 'url': 'https://www.sdjj.gov.cn/', 'region': '山东'},
    {'name': '河南纪检监察', 'url': 'https://www.hnsjw.gov.cn/sitesources/hnsjct/page_pc/index.html', 'region': '河南'},
    {'name': '湖北纪检监察', 'url': 'https://www.hbjwjc.gov.cn/', 'region': '湖北'},
    {'name': '湖南纪检监察', 'url': 'https://www.sxfj.gov.cn/', 'region': '湖南'},
    {'name': '广东纪检监察', 'url': 'https://www.gdjct.gd.gov.cn/', 'region': '广东'},
    {'name': '广西纪检监察', 'url': 'https://www.gxjjw.gov.cn/index.shtml', 'region': '广西'},
    {'name': '海南纪检监察', 'url': 'https://www.hncdi.gov.cn/web/hnlzw/v2/html/index.jsp', 'region': '海南'},
    {'name': '重庆纪检监察', 'url': 'https://jjc.cq.gov.cn/', 'region': '重庆'},
    {'name': '四川纪检监察', 'url': 'https://www.scjc.gov.cn/', 'region': '四川'},
    {'name': '贵州纪检监察', 'url': 'http://www.gzdis.gov.cn/', 'region': '贵州'},
    {'name': '云南纪检监察', 'url': 'http://www.ynjjjc.gov.cn/', 'region': '云南'},
    {'name': '西藏纪检监察', 'url': 'http://www.xzjjw.gov.cn/', 'region': '西藏'},
    {'name': '陕西纪检监察', 'url': 'https://www.qinfeng.gov.cn/', 'region': '陕西'},
    {'name': '甘肃纪检监察', 'url': 'http://www.gsjw.gov.cn/', 'region': '甘肃'},
    {'name': '青海纪检监察', 'url': 'http://www.qhjc.gov.cn/', 'region': '青海'},
    {'name': '宁夏纪检监察', 'url': 'http://www.nxjjjc.gov.cn/', 'region': '宁夏'},
    {'name': '新疆纪检监察', 'url': 'https://www.xjjw.gov.cn/', 'region': '新疆'},
    {'name': '新疆兵团纪检监察', 'url': 'http://btjw.xjbt.gov.cn/', 'region': '新疆兵团'},
]
29 crawler_project/urls.py Normal file
@@ -0,0 +1,29 @@
"""
URL configuration for crawler_project project.

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/5.2/topics/http/urls/
Examples:
Function views
    1. Add an import:  from my_app import views
    2. Add a URL to urlpatterns:  path('', views.home, name='home')
Class-based views
    1. Add an import:  from other_app.views import Home
    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
Including another URLconf
    1. Import the include() function: from django.urls import include, path
    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include
from django.conf import settings
from django.conf.urls.static import static

urlpatterns = [
    path('admin/', admin.site.urls),
    path('', include('crawler.urls')),
]

if settings.DEBUG:
    urlpatterns += static(settings.STATIC_URL, document_root=settings.STATIC_ROOT)
    urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)
16 crawler_project/wsgi.py Normal file
@@ -0,0 +1,16 @@
"""
WSGI config for crawler_project project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/5.2/howto/deployment/wsgi/
"""

import os

from django.core.wsgi import get_wsgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')

application = get_wsgi_application()
22 manage.py Executable file
@@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys


def main():
    """Run administrative tasks."""
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'crawler_project.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()
29 requirements.txt Normal file
@@ -0,0 +1,29 @@
amqp==5.3.1
asgiref==3.9.1
beautifulsoup4==4.13.5
billiard==4.2.2
celery==5.5.3
certifi==2025.8.3
charset-normalizer==3.4.3
click==8.3.0
click-didyoumean==0.3.1
click-plugins==1.1.1.2
click-repl==0.3.0
Django==5.2.6
djangorestframework==3.15.2
idna==3.10
kombu==5.5.4
lxml==6.0.2
packaging==25.0
prompt_toolkit==3.0.52
python-dateutil==2.9.0.post0
redis==6.4.0
requests==2.32.5
six==1.17.0
soupsieve==2.8
sqlparse==0.5.3
typing_extensions==4.15.0
tzdata==2025.2
urllib3==2.5.0
vine==5.1.0
wcwidth==0.2.14
49 start.sh Executable file
@@ -0,0 +1,49 @@
#!/bin/bash

echo "启动网站爬虫系统..."

# Make sure we are running from the project root
if [ ! -f "manage.py" ]; then
    echo "错误: 请在项目根目录运行此脚本"
    exit 1
fi

# Check that Python 3 is available
if ! command -v python3 &> /dev/null; then
    echo "错误: 未找到Python3"
    exit 1
fi

# Install dependencies
echo "安装依赖..."
pip install -r requirements.txt

# Run database migrations
echo "执行数据库迁移..."
python3 manage.py makemigrations
python3 manage.py migrate

# Initialize website data
echo "初始化网站数据..."
python3 manage.py init_websites

# Create the superuser if it does not exist
echo "检查超级用户..."
python3 manage.py shell -c "
from django.contrib.auth import get_user_model
User = get_user_model()
if not User.objects.filter(username='admin').exists():
    User.objects.create_superuser('admin', 'admin@example.com', 'admin123')
    print('创建超级用户: admin/admin123')
else:
    print('超级用户已存在')
"

echo "启动Django服务器..."
echo "访问地址: http://localhost:8000"
echo "管理后台: http://localhost:8000/admin"
echo "用户名: admin, 密码: admin123"
echo ""
echo "按 Ctrl+C 停止服务器"

python3 manage.py runserver 0.0.0.0:8000
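start.sh invokes `python3 manage.py init_websites`, but that management command is not part of this commit. A hypothetical sketch of how it could seed the Website table from settings.TARGET_WEBSITES (file path, command name, and field mapping are assumptions based on the model fields used elsewhere in this diff; it would live in crawler/management/commands/init_websites.py):

from django.conf import settings
from django.core.management.base import BaseCommand
from crawler.models import Website


class Command(BaseCommand):
    help = "Seed the Website table from settings.TARGET_WEBSITES"

    def handle(self, *args, **options):
        for site in settings.TARGET_WEBSITES:
            # Create each target site once, keyed by URL
            Website.objects.get_or_create(
                url=site['url'],
                defaults={'name': site['name'], 'region': site['region']},
            )
        self.stdout.write(self.style.SUCCESS("Websites initialized"))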
12 start_celery.sh Executable file
@@ -0,0 +1,12 @@
#!/bin/bash

# Start the Celery worker
echo "启动Celery Worker..."
celery -A crawler_project worker --loglevel=info --concurrency=4 &
echo "Worker PID: $!"

# Start Celery beat (scheduled tasks)
echo "启动Celery Beat..."
celery -A crawler_project beat --loglevel=info &

echo "Celery服务已启动"