Compare commits
39 Commits
130999364f...develop

| SHA1 |
|---|
| a36d730384 |
| 499454ff27 |
| bf927dc77c |
| 81a17132e2 |
| 8592833d74 |
| a4891b1c30 |
| 922a88048b |
| 100a0cd042 |
| 514197d5b3 |
| 31fe69535c |
| 1b947158a9 |
| 46f9ff87f1 |
| 193894fcb4 |
| 4945b4c6b0 |
| e82b85f4dd |
| c4dfc515f7 |
| 8db0512a6a |
| d3760c5780 |
| 490cc835d2 |
| 99660f4218 |
| b9c31a4da1 |
| 2fe9e40840 |
| 0aff839ed2 |
| 8405bd2402 |
| 651964ebfc |
| e71e7e7eb3 |
| 7e6325c68e |
| d64bf93988 |
| 83d1b21686 |
| 7b16c384d3 |
| e04a611dbc |
| 1856f3e9fc |
| 89909d2781 |
| ac98ac0057 |
| 4994310f14 |
| 31d0525cd0 |
| c618528a0a |
| 5e396796ca |
| baea50bfa0 |
6 .gitignore (vendored)
@@ -180,5 +180,11 @@ cython_debug/
#
#####################################

# Data directories
data/
date/media/

# Configuration files
config/
.env
73 Dockerfile (new file)
@@ -0,0 +1,73 @@
# Use the official Python 3.12 image
FROM python:3.12-slim

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV DJANGO_SETTINGS_MODULE=green_classroom.settings

# Set the working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    libpq-dev \
    curl \
    wget \
    gnupg \
    unzip \
    && rm -rf /var/lib/apt/lists/*

# Install Chrome and ChromeDriver
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
    && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list \
    && apt-get update \
    && apt-get install -y google-chrome-stable \
    && rm -rf /var/lib/apt/lists/*

# Download ChromeDriver
RUN CHROME_VERSION=$(google-chrome --version | awk '{print $3}' | awk -F'.' '{print $1}') \
    && wget -q "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_VERSION}" -O /tmp/chromedriver_version \
    && CHROMEDRIVER_VERSION=$(cat /tmp/chromedriver_version) \
    && wget -q "https://chromedriver.storage.googleapis.com/${CHROMEDRIVER_VERSION}/chromedriver_linux64.zip" -O /tmp/chromedriver.zip \
    && unzip /tmp/chromedriver.zip -d /usr/local/bin/ \
    && rm /tmp/chromedriver.zip /tmp/chromedriver_version \
    && chmod +x /usr/local/bin/chromedriver

# Copy requirements.txt
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy project files
COPY . .

# Create required directories
RUN mkdir -p /app/data/logs /app/data/static /app/data/media

# Collect static files
RUN python manage.py collectstatic --noinput

# Expose the port
EXPOSE 8000

# Create the startup script
RUN echo '#!/bin/bash\n\
if [ "$1" = "celery" ]; then\n\
    exec celery -A green_classroom worker --loglevel=info\n\
elif [ "$1" = "celery-beat" ]; then\n\
    exec celery -A green_classroom beat --loglevel=info\n\
elif [ "$1" = "flower" ]; then\n\
    exec celery -A green_classroom flower\n\
elif [ "$1" = "gunicorn" ]; then\n\
    exec gunicorn green_classroom.asgi:application -b 0.0.0.0:8000 --worker-class uvicorn.workers.UvicornWorker --workers 2\n\
else\n\
    exec python manage.py runserver 0.0.0.0:8000\n\
fi' > /app/entrypoint.sh && chmod +x /app/entrypoint.sh

# Set the entrypoint
ENTRYPOINT ["/app/entrypoint.sh"]
CMD ["runserver"]
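The Dockerfile above installs google-chrome-stable together with a matching chromedriver binary at /usr/local/bin/chromedriver. How the project's crawler actually drives them is not part of this compare, so the following is only a minimal sketch, assuming a Selenium-based crawler; the option flags and the example URL are assumptions, not code from this repository.

# Minimal sketch (assumption: the crawler uses Selenium with the Chrome/ChromeDriver
# binaries installed by the Dockerfile; this snippet is not part of the repository).
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
options.add_argument("--headless")             # no display inside the container
options.add_argument("--no-sandbox")           # commonly needed when Chrome runs as root
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(
    service=Service("/usr/local/bin/chromedriver"),  # path created by the Dockerfile
    options=options,
)
try:
    driver.get("https://www.news.cn/")         # example target only
    print(driver.title)
finally:
    driver.quit()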
349 core/admin.py (deleted)
@@ -1,349 +0,0 @@
from django.contrib import admin
from django.contrib.admin import AdminSite
from .models import Website, Article
# Imports used by the admin actions
from django.contrib import messages
from django.http import HttpResponseRedirect
# Imports needed for the export features
import csv
from django.http import HttpResponse
import json


# Create custom admin sites
class NewsCnAdminSite(AdminSite):
    site_header = "新华网管理后台"
    site_title = "新华网管理"
    index_title = "新华网内容管理"


class DongfangyancaoAdminSite(AdminSite):
    site_header = "东方烟草报管理后台"
    site_title = "东方烟草报管理"
    index_title = "东方烟草报内容管理"


# Instantiate the admin sites
news_cn_admin = NewsCnAdminSite(name='news_cn_admin')
dongfangyancao_admin = DongfangyancaoAdminSite(name='dongfangyancao_admin')


@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
    list_display = ('name', 'base_url', 'enabled')


# Add custom actions to ArticleAdmin
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'website', 'pub_date')
    search_fields = ('title', 'content')
    # Action options
    actions = ['delete_selected_articles', 'delete_dongfangyancao_articles', 'export_as_csv', 'export_as_json',
               'export_as_word']

    def delete_dongfangyancao_articles(self, request, queryset):
        """Delete all Dongfangyancao (东方烟草报) articles in one click."""
        # Look up the Dongfangyancao website object
        try:
            dongfangyancao_website = Website.objects.get(name='东方烟草报')
            # Delete all of its articles
            deleted_count = Article.objects.filter(website=dongfangyancao_website).delete()[0]
            self.message_user(request, f"成功删除 {deleted_count} 篇东方烟草报文章", messages.SUCCESS)
        except Website.DoesNotExist:
            self.message_user(request, "未找到东方烟草报网站配置", messages.ERROR)

    # Display name for the action
    delete_dongfangyancao_articles.short_description = "删除所有东方烟草报文章"

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV."""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields]

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta)
        writer = csv.writer(response)

        writer.writerow(field_names)
        for obj in queryset:
            row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field) for field in
                   field_names]
            writer.writerow(row)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON."""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=articles.json'

        # Build the data to export
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })

        # Write the JSON data
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"

    def export_as_word(self, request, queryset):
        """Export the selected articles as a Word document."""
        try:
            from docx import Document
            from io import BytesIO
            from docx.shared import Inches
        except ImportError:
            self.message_user(request, "缺少python-docx库,请安装: pip install python-docx", messages.ERROR)
            return

        # Create the Word document
        doc = Document()
        doc.add_heading('文章导出', 0)

        for article in queryset:
            # Add the article title
            doc.add_heading(article.title, level=1)

            # Add article metadata
            doc.add_paragraph(f"网站: {article.website.name}")
            doc.add_paragraph(f"URL: {article.url}")
            doc.add_paragraph(
                f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
            doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")

            # Add the article body
            doc.add_heading('内容', level=2)
            # Lightly process the HTML content: strip tags and handle images
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(article.content, 'html.parser')

            # Handle images embedded in the content
            for img in soup.find_all('img'):
                src = img.get('src', '')
                if src:
                    # Try to add the image to the document
                    try:
                        import os
                        from django.conf import settings
                        import requests
                        from io import BytesIO

                        # Build the full image path
                        if src.startswith('http'):
                            # Remote image
                            response = requests.get(src, timeout=10)
                            image_stream = BytesIO(response.content)
                            doc.add_picture(image_stream, width=Inches(4.0))
                        else:
                            # Local image
                            full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                            if os.path.exists(full_path):
                                doc.add_picture(full_path, width=Inches(4.0))
                    except Exception as e:
                        # If adding the image fails, fall back to its URL as text
                        doc.add_paragraph(f"[图片: {src}]")

                    # Remove the original img tag
                    img.decompose()

            content_text = soup.get_text()
            doc.add_paragraph(content_text)

            # Add media file information
            if article.media_files:
                doc.add_heading('媒体文件', level=2)
                for media_file in article.media_files:
                    try:
                        import os
                        from django.conf import settings
                        from io import BytesIO
                        import requests

                        full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                        if os.path.exists(full_path):
                            # Add the image to the document
                            doc.add_picture(full_path, width=Inches(4.0))
                        else:
                            # Media file given as a URL
                            if media_file.startswith('http'):
                                response = requests.get(media_file, timeout=10)
                                image_stream = BytesIO(response.content)
                                doc.add_picture(image_stream, width=Inches(4.0))
                            else:
                                doc.add_paragraph(media_file)
                    except Exception as e:
                        doc.add_paragraph(media_file)

            # Add a page break
            doc.add_page_break()

        # Save to an in-memory buffer
        buffer = BytesIO()
        doc.save(buffer)
        buffer.seek(0)

        # Build the HttpResponse
        from django.http import HttpResponse
        response = HttpResponse(buffer.getvalue(),
                                content_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
        response['Content-Disposition'] = 'attachment; filename=articles.docx'
        return response

    export_as_word.short_description = "导出选中文章为Word格式"


# Dedicated article admin classes for the individual websites
class NewsCnArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'pub_date')
    search_fields = ('title', 'content')
    list_filter = ('pub_date',)
    actions = ['export_as_csv', 'export_as_json']

    def get_queryset(self, request):
        qs = super().get_queryset(request)
        # Only show articles from www.news.cn (Xinhua)
        return qs.filter(website__name='www.news.cn')

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV."""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields if field.name != 'content']  # exclude content to keep the CSV small

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv'
        writer = csv.writer(response)

        writer.writerow(field_names)
        for obj in queryset:
            row = []
            for field in field_names:
                value = getattr(obj, field)
                if callable(value):
                    value = value()
                if field == 'website':
                    value = value.name
                row.append(value)
            writer.writerow(row)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON."""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json'

        # Build the data to export
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })

        # Write the JSON data
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"


class DongfangyancaoArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'pub_date')
    search_fields = ('title', 'content')
    list_filter = ('pub_date',)
    # Action options
    actions = ['delete_selected_articles', 'delete_all_articles', 'export_as_csv', 'export_as_json']

    def get_queryset(self, request):
        qs = super().get_queryset(request)
        # Only show Dongfangyancao articles
        return qs.filter(website__name='东方烟草报')

    def delete_all_articles(self, request, queryset):
        """Delete every article in the current queryset (all Dongfangyancao articles)."""
        # Delete all Dongfangyancao articles
        deleted_count = self.get_queryset(request).delete()[0]
        self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS)

    # Display name for the action
    delete_all_articles.short_description = "删除所有当前筛选的文章"

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV."""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields if field.name != 'content']  # exclude content to keep the CSV small

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.csv'
        writer = csv.writer(response)

        writer.writerow(field_names)
        for obj in queryset:
            row = []
            for field in field_names:
                value = getattr(obj, field)
                if callable(value):
                    value = value()
                if field == 'website':
                    value = value.name
                row.append(value)
            writer.writerow(row)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON."""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json'

        # Build the data to export
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })

        # Write the JSON data
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"


# Register the models with their respective admin sites
news_cn_admin.register(Website, WebsiteAdmin)
news_cn_admin.register(Article, NewsCnArticleAdmin)

dongfangyancao_admin.register(Website, WebsiteAdmin)
dongfangyancao_admin.register(Article, DongfangyancaoArticleAdmin)
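The custom AdminSite instances defined in this core/admin.py (removed by this compare), news_cn_admin and dongfangyancao_admin, only become reachable once they are mounted in the project's URL configuration, which is not shown here. A minimal sketch under the pre-change layout, assuming hypothetical URL prefixes:

# urls.py sketch (assumption: the prefixes below are hypothetical, not taken from this repository)
from django.contrib import admin
from django.urls import path

from core.admin import news_cn_admin, dongfangyancao_admin

urlpatterns = [
    path("admin/", admin.site.urls),                            # default admin
    path("news-cn-admin/", news_cn_admin.urls),                 # NewsCnAdminSite instance
    path("dongfangyancao-admin/", dongfangyancao_admin.urls),   # DongfangyancaoAdminSite instance
]

Each AdminSite exposes a .urls property, so mounting a custom instance is the same one-liner as mounting the default admin.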
947 core/admin_extended.py (new file)
@@ -0,0 +1,947 @@
|
||||
"""
|
||||
Django Admin扩展
|
||||
提供增强的管理界面功能
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from django.contrib import admin
|
||||
from django.contrib.admin import SimpleListFilter
|
||||
from django.contrib.admin.utils import model_format_dict
|
||||
from django.contrib import messages
|
||||
from django.http import HttpResponseRedirect
|
||||
from django.urls import path, reverse
|
||||
from django.utils.html import format_html
|
||||
from django.utils import timezone
|
||||
from django.db.models import Count, Q
|
||||
from django.core.cache import cache
|
||||
|
||||
from .models import Website, Article, CrawlTask
|
||||
from .tasks import crawl_website, crawl_all_websites, cleanup_old_articles
|
||||
from .distributed_crawler import distributed_crawler
|
||||
from .task_executor import task_executor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WebsiteStatusFilter(SimpleListFilter):
|
||||
"""网站状态过滤器"""
|
||||
title = '网站状态'
|
||||
parameter_name = 'status'
|
||||
|
||||
def lookups(self, request, model_admin):
|
||||
return (
|
||||
('enabled', '已启用'),
|
||||
('disabled', '已禁用'),
|
||||
('no_articles', '无文章'),
|
||||
('recent_crawl', '最近爬取'),
|
||||
)
|
||||
|
||||
def queryset(self, request, queryset):
|
||||
if self.value() == 'enabled':
|
||||
return queryset.filter(enabled=True)
|
||||
elif self.value() == 'disabled':
|
||||
return queryset.filter(enabled=False)
|
||||
elif self.value() == 'no_articles':
|
||||
return queryset.annotate(article_count=Count('article')).filter(article_count=0)
|
||||
elif self.value() == 'recent_crawl':
|
||||
week_ago = timezone.now() - timedelta(days=7)
|
||||
return queryset.filter(last_crawl__gte=week_ago)
|
||||
return queryset
|
||||
|
||||
|
||||
class ArticleDateFilter(SimpleListFilter):
|
||||
"""文章日期过滤器"""
|
||||
title = '发布时间'
|
||||
parameter_name = 'date_range'
|
||||
|
||||
def lookups(self, request, model_admin):
|
||||
return (
|
||||
('today', '今天'),
|
||||
('week', '本周'),
|
||||
('month', '本月'),
|
||||
('quarter', '本季度'),
|
||||
)
|
||||
|
||||
def queryset(self, request, queryset):
|
||||
now = timezone.now()
|
||||
if self.value() == 'today':
|
||||
return queryset.filter(created_at__date=now.date())
|
||||
elif self.value() == 'week':
|
||||
week_start = now - timedelta(days=now.weekday())
|
||||
return queryset.filter(created_at__gte=week_start.replace(hour=0, minute=0, second=0))
|
||||
elif self.value() == 'month':
|
||||
return queryset.filter(created_at__year=now.year, created_at__month=now.month)
|
||||
elif self.value() == 'quarter':
|
||||
quarter = (now.month - 1) // 3
|
||||
quarter_start_month = quarter * 3 + 1
|
||||
return queryset.filter(
|
||||
created_at__year=now.year,
|
||||
created_at__month__gte=quarter_start_month,
|
||||
created_at__month__lt=quarter_start_month + 3
|
||||
)
|
||||
return queryset
|
||||
|
||||
|
||||
class WebsiteAdmin(admin.ModelAdmin):
|
||||
"""网站管理"""
|
||||
list_display = [
|
||||
'name', 'base_url', 'enabled', 'article_count',
|
||||
'last_crawl_display', 'status_indicator', 'actions_column'
|
||||
]
|
||||
list_filter = [WebsiteStatusFilter, 'enabled']
|
||||
search_fields = ['name', 'base_url']
|
||||
readonly_fields = ['article_count']
|
||||
actions = ['enable_websites', 'disable_websites', 'crawl_selected', 'crawl_all']
|
||||
|
||||
fieldsets = (
|
||||
('基本信息', {
|
||||
'fields': ('name', 'base_url', 'enabled')
|
||||
}),
|
||||
('统计信息', {
|
||||
'fields': ('article_count',),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
('时间信息', {
|
||||
'fields': (),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
)
|
||||
|
||||
# 添加get_websites方法以支持模板中的网站选择
|
||||
def get_websites(self, request):
|
||||
"""获取所有启用的网站,用于模板中的选择框"""
|
||||
return Website.objects.filter(enabled=True)
|
||||
|
||||
def article_count(self, obj):
|
||||
"""文章数量"""
|
||||
return obj.article_set.count()
|
||||
|
||||
article_count.short_description = '文章数量'
|
||||
|
||||
def last_crawl_display(self, obj):
|
||||
"""最后爬取时间显示"""
|
||||
return '未实现'
|
||||
|
||||
last_crawl_display.short_description = '最后爬取'
|
||||
|
||||
def status_indicator(self, obj):
|
||||
"""状态指示器"""
|
||||
if obj.enabled:
|
||||
return format_html('<span style="color: green;">●</span> 正常')
|
||||
else:
|
||||
return format_html('<span style="color: red;">●</span> 禁用')
|
||||
|
||||
status_indicator.short_description = '状态'
|
||||
|
||||
def actions_column(self, obj):
|
||||
"""操作列"""
|
||||
return format_html(
|
||||
'<a href="{}" class="button">爬取</a> '
|
||||
'<a href="{}" class="button">查看文章</a>',
|
||||
reverse('admin:crawl_website', args=[obj.id]),
|
||||
reverse('admin:core_article_changelist') + f'?website__id__exact={obj.id}'
|
||||
)
|
||||
|
||||
actions_column.short_description = '操作'
|
||||
|
||||
def enable_websites(self, request, queryset):
|
||||
"""启用选中的网站"""
|
||||
updated = queryset.update(enabled=True)
|
||||
self.message_user(request, f'成功启用 {updated} 个网站')
|
||||
|
||||
enable_websites.short_description = '启用选中的网站'
|
||||
|
||||
def disable_websites(self, request, queryset):
|
||||
"""禁用选中的网站"""
|
||||
updated = queryset.update(enabled=False)
|
||||
self.message_user(request, f'成功禁用 {updated} 个网站')
|
||||
|
||||
disable_websites.short_description = '禁用选中的网站'
|
||||
|
||||
def crawl_selected(self, request, queryset):
|
||||
"""爬取选中的网站"""
|
||||
for website in queryset:
|
||||
try:
|
||||
task = crawl_website.delay(website.id)
|
||||
self.message_user(
|
||||
request,
|
||||
f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})',
|
||||
messages.SUCCESS
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "[Errno 61] Connection refused" in error_msg:
|
||||
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
|
||||
else:
|
||||
detailed_msg = error_msg
|
||||
self.message_user(
|
||||
request,
|
||||
f'网站 {website.name} 爬取任务启动失败: {detailed_msg}',
|
||||
messages.ERROR
|
||||
)
|
||||
|
||||
crawl_selected.short_description = '爬取选中的网站'
|
||||
|
||||
def crawl_all(self, request, queryset):
|
||||
try:
|
||||
task = crawl_all_websites.delay()
|
||||
self.message_user(
|
||||
request,
|
||||
f'批量爬取任务已启动 (任务ID: {task.id})',
|
||||
messages.SUCCESS
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "[Errno 61] Connection refused" in error_msg:
|
||||
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
|
||||
else:
|
||||
detailed_msg = error_msg
|
||||
self.message_user(
|
||||
request,
|
||||
f'批量爬取任务启动失败: {detailed_msg}',
|
||||
messages.ERROR
|
||||
)
|
||||
|
||||
# crawl_all.short_description = '爬取所有网站'
|
||||
|
||||
def get_urls(self):
|
||||
"""添加自定义URL"""
|
||||
urls = super().get_urls()
|
||||
custom_urls = [
|
||||
path(
|
||||
'<int:website_id>/crawl/',
|
||||
self.admin_site.admin_view(self.crawl_website_view),
|
||||
name='crawl_website',
|
||||
),
|
||||
path(
|
||||
'run-crawler/',
|
||||
self.admin_site.admin_view(self.run_crawler_view),
|
||||
name='run_crawler',
|
||||
),
|
||||
]
|
||||
return custom_urls + urls
|
||||
|
||||
def crawl_website_view(self, request, website_id):
|
||||
"""爬取单个网站视图"""
|
||||
try:
|
||||
website = Website.objects.get(id=website_id)
|
||||
task = crawl_website.delay(website_id)
|
||||
self.message_user(
|
||||
request,
|
||||
f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})',
|
||||
messages.SUCCESS
|
||||
)
|
||||
except Website.DoesNotExist:
|
||||
self.message_user(request, '网站不存在', messages.ERROR)
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "[Errno 61] Connection refused" in error_msg:
|
||||
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
|
||||
else:
|
||||
detailed_msg = error_msg
|
||||
self.message_user(request, f'爬取任务启动失败: {detailed_msg}', messages.ERROR)
|
||||
|
||||
return HttpResponseRedirect(reverse('admin:core_website_changelist'))
|
||||
|
||||
def run_crawler_view(self, request):
|
||||
"""运行爬虫视图"""
|
||||
try:
|
||||
task = crawl_all_websites.delay()
|
||||
self.message_user(
|
||||
request,
|
||||
f'批量爬取任务已启动 (任务ID: {task.id})',
|
||||
messages.SUCCESS
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "[Errno 61] Connection refused" in error_msg:
|
||||
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
|
||||
else:
|
||||
detailed_msg = error_msg
|
||||
self.message_user(
|
||||
request,
|
||||
f'批量爬取任务启动失败: {detailed_msg}',
|
||||
messages.ERROR
|
||||
)
|
||||
|
||||
return HttpResponseRedirect(reverse('admin:core_website_changelist'))
|
||||
|
||||
|
||||
class ArticleAdmin(admin.ModelAdmin):
|
||||
"""文章管理"""
|
||||
list_display = [
|
||||
'title', 'website', 'created_at',
|
||||
'media_count', 'actions_column'
|
||||
]
|
||||
list_filter = [
|
||||
ArticleDateFilter, 'website', 'created_at'
|
||||
]
|
||||
search_fields = ['title', 'content', 'url']
|
||||
readonly_fields = ['created_at', 'media_files_display']
|
||||
date_hierarchy = 'created_at'
|
||||
|
||||
fieldsets = (
|
||||
('基本信息', {
|
||||
'fields': ('title', 'url', 'website')
|
||||
}),
|
||||
('内容', {
|
||||
'fields': ('content',)
|
||||
}),
|
||||
('媒体文件', {
|
||||
'fields': ('media_files_display',),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
('时间信息', {
|
||||
'fields': ('created_at',),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
)
|
||||
|
||||
# 添加导出选中文章的操作
|
||||
actions = ['export_selected_articles']
|
||||
|
||||
def export_selected_articles(self, request, queryset):
|
||||
"""
|
||||
导出选中的文章为ZIP文件
|
||||
"""
|
||||
import zipfile
|
||||
from django.http import HttpResponse
|
||||
from io import BytesIO
|
||||
from django.conf import settings
|
||||
import os
|
||||
from bs4 import BeautifulSoup
|
||||
from docx import Document
|
||||
|
||||
# 创建内存中的ZIP文件
|
||||
zip_buffer = BytesIO()
|
||||
|
||||
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
|
||||
# 为每篇文章创建文件夹并添加内容
|
||||
for article in queryset:
|
||||
# 创建文章文件夹名称
|
||||
article_folder = f"article_{article.id}_{article.title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')}"
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading(article.title, 0)
|
||||
|
||||
# 添加文章信息
|
||||
doc.add_paragraph(f"网站: {article.website.name if article.website else ''}")
|
||||
doc.add_paragraph(f"URL: {article.url}")
|
||||
doc.add_paragraph(f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else ''}")
|
||||
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S') if article.created_at else ''}")
|
||||
|
||||
# 添加内容标题
|
||||
doc.add_heading('内容:', level=1)
|
||||
|
||||
# 处理HTML内容
|
||||
soup = BeautifulSoup(article.content, 'html.parser')
|
||||
content_text = soup.get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 将Word文档保存到内存中
|
||||
doc_buffer = BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
|
||||
# 将Word文档添加到ZIP文件
|
||||
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'), doc_buffer.getvalue())
|
||||
|
||||
# 添加媒体文件到ZIP包
|
||||
if article.media_files:
|
||||
for media_file in article.media_files:
|
||||
try:
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', os.path.basename(media_file)))
|
||||
except Exception as e:
|
||||
# 如果添加媒体文件失败,继续处理其他文件
|
||||
pass
|
||||
|
||||
# 创建HttpResponse
|
||||
zip_buffer.seek(0)
|
||||
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
|
||||
response['Content-Disposition'] = 'attachment; filename=selected_articles.zip'
|
||||
|
||||
return response
|
||||
|
||||
export_selected_articles.short_description = "导出所选的文章为ZIP"
|
||||
|
||||
def content_preview(self, obj):
|
||||
"""内容预览"""
|
||||
return obj.content[:100] + '...' if len(obj.content) > 100 else obj.content
|
||||
|
||||
content_preview.short_description = '内容预览'
|
||||
|
||||
def media_count(self, obj):
|
||||
"""媒体文件数量"""
|
||||
if obj.media_files:
|
||||
return len(obj.media_files)
|
||||
return 0
|
||||
|
||||
media_count.short_description = '媒体文件'
|
||||
|
||||
def media_files_display(self, obj):
|
||||
"""媒体文件显示"""
|
||||
if not obj.media_files:
|
||||
return '无媒体文件'
|
||||
|
||||
html = '<div style="max-height: 300px; overflow-y: auto;">'
|
||||
for i, media in enumerate(obj.media_files):
|
||||
if media.get('type') == 'image':
|
||||
html += f'<div style="margin: 10px 0;"><img src="{media["url"]}" style="max-width: 200px; max-height: 150px;" /></div>'
|
||||
elif media.get('type') == 'video':
|
||||
html += f'<div style="margin: 10px 0;"><video controls style="max-width: 200px;"><source src="{media["url"]}" type="video/mp4"></video></div>'
|
||||
html += '</div>'
|
||||
return format_html(html)
|
||||
|
||||
media_files_display.short_description = '媒体文件'
|
||||
|
||||
def actions_column(self, obj):
|
||||
"""操作列"""
|
||||
# 修改: 添加跳转到本地文章详情页的链接
|
||||
return format_html(
|
||||
'<a href="{}" target="_blank" class="button">查看原文</a> '
|
||||
'<a href="{}" target="_blank" class="button">本地查看</a>',
|
||||
obj.url,
|
||||
reverse('article_detail', args=[obj.id])
|
||||
)
|
||||
|
||||
actions_column.short_description = '操作'
|
||||
|
||||
|
||||
class CrawlTaskStatusFilter(SimpleListFilter):
|
||||
"""爬取任务状态过滤器"""
|
||||
title = '任务状态'
|
||||
parameter_name = 'status'
|
||||
|
||||
def lookups(self, request, model_admin):
|
||||
return (
|
||||
('pending', '等待中'),
|
||||
('running', '运行中'),
|
||||
('completed', '已完成'),
|
||||
('failed', '失败'),
|
||||
('cancelled', '已取消'),
|
||||
)
|
||||
|
||||
def queryset(self, request, queryset):
|
||||
if self.value():
|
||||
return queryset.filter(status=self.value())
|
||||
return queryset
|
||||
|
||||
|
||||
class CrawlTaskTypeFilter(SimpleListFilter):
|
||||
"""爬取任务类型过滤器"""
|
||||
title = '任务类型'
|
||||
parameter_name = 'task_type'
|
||||
|
||||
def lookups(self, request, model_admin):
|
||||
return (
|
||||
('keyword', '关键词搜索'),
|
||||
('historical', '历史文章'),
|
||||
('full_site', '全站爬取'),
|
||||
)
|
||||
|
||||
def queryset(self, request, queryset):
|
||||
if self.value():
|
||||
return queryset.filter(task_type=self.value())
|
||||
return queryset
|
||||
|
||||
|
||||
class CrawlTaskAdmin(admin.ModelAdmin):
|
||||
"""爬取任务管理"""
|
||||
list_display = [
|
||||
'name', 'task_type', 'keyword', 'websites_display', 'status',
|
||||
'progress_display', 'created_at', 'duration_display', 'actions_column'
|
||||
]
|
||||
list_filter = [CrawlTaskStatusFilter, CrawlTaskTypeFilter, 'created_at']
|
||||
search_fields = ['name', 'keyword', 'created_by']
|
||||
readonly_fields = [
|
||||
'status', 'progress', 'current_website', 'current_action',
|
||||
'total_articles', 'success_count', 'failed_count',
|
||||
'created_at', 'started_at', 'completed_at', 'error_message',
|
||||
'result_details', 'duration_display', 'progress_display',
|
||||
'execution_count', 'last_execution_at', 'execution_summary'
|
||||
]
|
||||
actions = ['start_tasks', 'rerun_tasks', 'cancel_tasks', 'delete_completed_tasks']
|
||||
|
||||
class Media:
|
||||
js = ('admin/js/crawl_task_actions.js',)
|
||||
|
||||
fieldsets = (
|
||||
('基本信息', {
|
||||
'fields': ('name', 'task_type', 'keyword')
|
||||
}),
|
||||
('爬取配置', {
|
||||
'fields': ('websites', 'start_date', 'end_date', 'max_pages', 'max_articles')
|
||||
}),
|
||||
('任务状态', {
|
||||
'fields': ('status', 'progress_display', 'current_website', 'current_action'),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
('统计信息', {
|
||||
'fields': ('total_articles', 'success_count', 'failed_count'),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
('时间信息', {
|
||||
'fields': ('created_at', 'started_at', 'completed_at', 'duration_display'),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
('执行历史', {
|
||||
'fields': ('execution_count', 'last_execution_at', 'execution_summary'),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
('错误信息', {
|
||||
'fields': ('error_message',),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
('结果详情', {
|
||||
'fields': ('result_details',),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
)
|
||||
|
||||
def websites_display(self, obj):
|
||||
"""网站列表显示"""
|
||||
return obj.get_websites_display()
|
||||
websites_display.short_description = '目标网站'
|
||||
|
||||
def progress_display(self, obj):
|
||||
"""进度显示"""
|
||||
if obj.status == 'running':
|
||||
return format_html(
|
||||
'<div style="width: 100px; background-color: #f0f0f0; border-radius: 3px;">'
|
||||
'<div style="width: {}%; background-color: #4CAF50; height: 20px; border-radius: 3px; text-align: center; color: white; line-height: 20px;">{}%</div>'
|
||||
'</div>',
|
||||
obj.progress, obj.progress
|
||||
)
|
||||
elif obj.status == 'completed':
|
||||
return format_html('<span style="color: green;">✓ 完成</span>')
|
||||
elif obj.status == 'failed':
|
||||
return format_html('<span style="color: red;">✗ 失败</span>')
|
||||
elif obj.status == 'cancelled':
|
||||
return format_html('<span style="color: orange;">⊘ 已取消</span>')
|
||||
else:
|
||||
return format_html('<span style="color: gray;">⏳ 等待</span>')
|
||||
progress_display.short_description = '进度'
|
||||
|
||||
def duration_display(self, obj):
|
||||
"""执行时长显示"""
|
||||
duration = obj.get_duration()
|
||||
if duration:
|
||||
total_seconds = int(duration.total_seconds())
|
||||
hours = total_seconds // 3600
|
||||
minutes = (total_seconds % 3600) // 60
|
||||
seconds = total_seconds % 60
|
||||
if hours > 0:
|
||||
return f"{hours}小时{minutes}分钟"
|
||||
elif minutes > 0:
|
||||
return f"{minutes}分钟{seconds}秒"
|
||||
else:
|
||||
return f"{seconds}秒"
|
||||
return "-"
|
||||
duration_display.short_description = '执行时长'
|
||||
|
||||
def execution_summary(self, obj):
|
||||
"""执行摘要显示"""
|
||||
return obj.get_execution_summary()
|
||||
execution_summary.short_description = '执行摘要'
|
||||
|
||||
def actions_column(self, obj):
|
||||
"""操作列"""
|
||||
actions = []
|
||||
|
||||
if obj.status == 'pending':
|
||||
actions.append(f'<a href="javascript:void(0)" onclick="startTask({obj.id})" class="button">开始</a>')
|
||||
|
||||
if obj.can_cancel():
|
||||
actions.append(f'<a href="javascript:void(0)" onclick="cancelTask({obj.id})" class="button">取消</a>')
|
||||
|
||||
if obj.status == 'completed':
|
||||
actions.append(f'<a href="javascript:void(0)" onclick="viewResults({obj.id})" class="button">查看结果</a>')
|
||||
actions.append(f'<a href="javascript:void(0)" onclick="rerunTask({obj.id})" class="button" style="background-color: #28a745;">重新执行</a>')
|
||||
|
||||
if obj.status in ['failed', 'cancelled']:
|
||||
actions.append(f'<a href="javascript:void(0)" onclick="rerunTask({obj.id})" class="button" style="background-color: #28a745;">重新执行</a>')
|
||||
|
||||
return format_html(' '.join(actions))
|
||||
actions_column.short_description = '操作'
|
||||
|
||||
def start_tasks(self, request, queryset):
|
||||
"""启动选中的任务"""
|
||||
started_count = 0
|
||||
for task in queryset.filter(status='pending'):
|
||||
try:
|
||||
success, message = task_executor.start_task(task.id)
|
||||
if success:
|
||||
started_count += 1
|
||||
else:
|
||||
self.message_user(request, f'启动任务 {task.name} 失败: {message}', messages.ERROR)
|
||||
except Exception as e:
|
||||
self.message_user(request, f'启动任务 {task.name} 失败: {e}', messages.ERROR)
|
||||
|
||||
if started_count > 0:
|
||||
self.message_user(request, f'成功启动 {started_count} 个任务', messages.SUCCESS)
|
||||
start_tasks.short_description = '启动选中的任务'
|
||||
|
||||
def rerun_tasks(self, request, queryset):
|
||||
"""重新执行选中的任务"""
|
||||
rerun_count = 0
|
||||
for task in queryset.filter(status__in=['completed', 'failed', 'cancelled']):
|
||||
try:
|
||||
success, message = task_executor.rerun_task(task.id)
|
||||
if success:
|
||||
rerun_count += 1
|
||||
else:
|
||||
self.message_user(request, f'重新执行任务 {task.name} 失败: {message}', messages.ERROR)
|
||||
except Exception as e:
|
||||
self.message_user(request, f'重新执行任务 {task.name} 失败: {e}', messages.ERROR)
|
||||
|
||||
if rerun_count > 0:
|
||||
self.message_user(request, f'成功重新执行 {rerun_count} 个任务', messages.SUCCESS)
|
||||
rerun_tasks.short_description = '重新执行选中的任务'
|
||||
|
||||
def cancel_tasks(self, request, queryset):
|
||||
"""取消选中的任务"""
|
||||
cancelled_count = 0
|
||||
for task in queryset.filter(status__in=['pending', 'running']):
|
||||
try:
|
||||
success, message = task_executor.cancel_task(task.id)
|
||||
if success:
|
||||
cancelled_count += 1
|
||||
else:
|
||||
self.message_user(request, f'取消任务 {task.name} 失败: {message}', messages.ERROR)
|
||||
except Exception as e:
|
||||
self.message_user(request, f'取消任务 {task.name} 失败: {e}', messages.ERROR)
|
||||
|
||||
if cancelled_count > 0:
|
||||
self.message_user(request, f'成功取消 {cancelled_count} 个任务', messages.SUCCESS)
|
||||
elif queryset.filter(status__in=['pending', 'running']).count() > 0:
|
||||
# 有任务但没有成功取消任何任务
|
||||
self.message_user(request, '没有成功取消任何任务', messages.WARNING)
|
||||
cancel_tasks.short_description = '取消选中的任务'
|
||||
|
||||
def delete_completed_tasks(self, request, queryset):
|
||||
"""删除已完成的任务"""
|
||||
completed_tasks = queryset.filter(status__in=['completed', 'failed', 'cancelled'])
|
||||
count = completed_tasks.count()
|
||||
completed_tasks.delete()
|
||||
|
||||
if count > 0:
|
||||
self.message_user(request, f'成功删除 {count} 个已完成的任务', messages.SUCCESS)
|
||||
delete_completed_tasks.short_description = '删除已完成的任务'
|
||||
|
||||
def get_urls(self):
|
||||
"""添加自定义URL"""
|
||||
urls = super().get_urls()
|
||||
custom_urls = [
|
||||
path(
|
||||
'create-keyword-task/',
|
||||
self.admin_site.admin_view(self.create_keyword_task_view),
|
||||
name='create_keyword_task',
|
||||
),
|
||||
path(
|
||||
'create-historical-task/',
|
||||
self.admin_site.admin_view(self.create_historical_task_view),
|
||||
name='create_historical_task',
|
||||
),
|
||||
path(
|
||||
'create-full-site-task/',
|
||||
self.admin_site.admin_view(self.create_full_site_task_view),
|
||||
name='create_full_site_task',
|
||||
),
|
||||
path(
|
||||
'<int:task_id>/start/',
|
||||
self.admin_site.admin_view(self.start_task_view),
|
||||
name='start_task',
|
||||
),
|
||||
path(
|
||||
'<int:task_id>/cancel/',
|
||||
self.admin_site.admin_view(self.cancel_task_view),
|
||||
name='cancel_task',
|
||||
),
|
||||
path(
|
||||
'<int:task_id>/rerun/',
|
||||
self.admin_site.admin_view(self.rerun_task_view),
|
||||
name='rerun_task',
|
||||
),
|
||||
path(
|
||||
'<int:task_id>/results/',
|
||||
self.admin_site.admin_view(self.view_results_view),
|
||||
name='view_results',
|
||||
),
|
||||
]
|
||||
return custom_urls + urls
|
||||
|
||||
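The routes registered in get_urls() above are namespaced under the admin site, so server-side code (and the crawl_task_actions.js helpers listed in Media) can resolve them by name. An illustrative reverse() call, assuming a task with id 42 and the default /admin/ mount point:

# Illustrative only: resolving the custom CrawlTaskAdmin routes registered in get_urls() above.
from django.urls import reverse

start_url = reverse("admin:start_task", args=[42])    # typically '/admin/core/crawltask/42/start/'
cancel_url = reverse("admin:cancel_task", args=[42])
rerun_url = reverse("admin:rerun_task", args=[42])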
def create_keyword_task_view(self, request):
|
||||
"""创建关键词搜索任务视图"""
|
||||
if request.method == 'POST':
|
||||
try:
|
||||
from .utils import WEBSITE_CRAWL_CONFIGS
|
||||
|
||||
name = request.POST.get('name', '')
|
||||
keyword = request.POST.get('keyword', '')
|
||||
websites = request.POST.getlist('websites')
|
||||
start_date = request.POST.get('start_date')
|
||||
end_date = request.POST.get('end_date')
|
||||
max_pages = int(request.POST.get('max_pages', 10))
|
||||
max_articles = int(request.POST.get('max_articles', 100))
|
||||
|
||||
if not name or not keyword:
|
||||
self.message_user(request, '任务名称和关键词不能为空', messages.ERROR)
|
||||
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
|
||||
|
||||
# 创建任务
|
||||
task = CrawlTask.objects.create(
|
||||
name=name,
|
||||
task_type='keyword',
|
||||
keyword=keyword,
|
||||
start_date=start_date if start_date else None,
|
||||
end_date=end_date if end_date else None,
|
||||
max_pages=max_pages,
|
||||
max_articles=max_articles,
|
||||
created_by=request.user.username if request.user.is_authenticated else 'admin'
|
||||
)
|
||||
|
||||
# 添加选择的网站
|
||||
if websites:
|
||||
website_objects = Website.objects.filter(name__in=websites)
|
||||
task.websites.set(website_objects)
|
||||
|
||||
self.message_user(request, f'关键词搜索任务 "{name}" 创建成功', messages.SUCCESS)
|
||||
return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
|
||||
|
||||
except Exception as e:
|
||||
self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
|
||||
|
||||
# GET请求,显示创建表单
|
||||
context = {
|
||||
'websites': Website.objects.filter(enabled=True),
|
||||
'title': '创建关键词搜索任务'
|
||||
}
|
||||
return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_keyword_task.html', context)
|
||||
|
||||
def create_historical_task_view(self, request):
|
||||
"""创建历史文章任务视图"""
|
||||
if request.method == 'POST':
|
||||
try:
|
||||
from .utils import WEBSITE_CRAWL_CONFIGS
|
||||
|
||||
name = request.POST.get('name', '')
|
||||
websites = request.POST.getlist('websites')
|
||||
start_date = request.POST.get('start_date')
|
||||
end_date = request.POST.get('end_date')
|
||||
max_articles = int(request.POST.get('max_articles', 50))
|
||||
|
||||
if not name:
|
||||
self.message_user(request, '任务名称不能为空', messages.ERROR)
|
||||
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
|
||||
|
||||
# 创建任务
|
||||
task = CrawlTask.objects.create(
|
||||
name=name,
|
||||
task_type='historical',
|
||||
keyword='历史文章',
|
||||
start_date=start_date if start_date else None,
|
||||
end_date=end_date if end_date else None,
|
||||
max_articles=max_articles,
|
||||
created_by=request.user.username if request.user.is_authenticated else 'admin'
|
||||
)
|
||||
|
||||
# 添加选择的网站
|
||||
if websites:
|
||||
website_objects = Website.objects.filter(name__in=websites)
|
||||
task.websites.set(website_objects)
|
||||
|
||||
self.message_user(request, f'历史文章任务 "{name}" 创建成功', messages.SUCCESS)
|
||||
return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
|
||||
|
||||
except Exception as e:
|
||||
self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
|
||||
|
||||
# GET请求,显示创建表单
|
||||
context = {
|
||||
'websites': Website.objects.filter(enabled=True),
|
||||
'title': '创建历史文章任务'
|
||||
}
|
||||
return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_historical_task.html', context)
|
||||
|
||||
def create_full_site_task_view(self, request):
|
||||
"""创建全站爬取任务视图"""
|
||||
if request.method == 'POST':
|
||||
try:
|
||||
from .utils import WEBSITE_CRAWL_CONFIGS
|
||||
|
||||
name = request.POST.get('name', '')
|
||||
websites = request.POST.getlist('websites')
|
||||
max_pages = int(request.POST.get('max_pages', 500))
|
||||
|
||||
if not name:
|
||||
self.message_user(request, '任务名称不能为空', messages.ERROR)
|
||||
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
|
||||
|
||||
# 创建任务
|
||||
task = CrawlTask.objects.create(
|
||||
name=name,
|
||||
task_type='full_site',
|
||||
keyword='全站爬取',
|
||||
max_pages=max_pages,
|
||||
created_by=request.user.username if request.user.is_authenticated else 'admin'
|
||||
)
|
||||
|
||||
# 添加选择的网站
|
||||
if websites:
|
||||
website_objects = Website.objects.filter(name__in=websites)
|
||||
task.websites.set(website_objects)
|
||||
|
||||
self.message_user(request, f'全站爬取任务 "{name}" 创建成功', messages.SUCCESS)
|
||||
return HttpResponseRedirect(reverse('admin:core_crawltask_change', args=[task.id]))
|
||||
|
||||
except Exception as e:
|
||||
self.message_user(request, f'创建任务失败: {e}', messages.ERROR)
|
||||
|
||||
# GET请求,显示创建表单
|
||||
context = {
|
||||
'websites': Website.objects.filter(enabled=True),
|
||||
'title': '创建全站爬取任务'
|
||||
}
|
||||
return admin.site.admin_view(self.render_create_task_template)(request, 'admin/create_full_site_task.html', context)
|
||||
|
||||
def start_task_view(self, request, task_id):
|
||||
"""启动任务视图"""
|
||||
try:
|
||||
success, message = task_executor.start_task(task_id)
|
||||
if success:
|
||||
self.message_user(request, f'任务已启动: {message}', messages.SUCCESS)
|
||||
else:
|
||||
self.message_user(request, f'启动任务失败: {message}', messages.ERROR)
|
||||
except Exception as e:
|
||||
self.message_user(request, f'启动任务失败: {e}', messages.ERROR)
|
||||
|
||||
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
|
||||
|
||||
def rerun_task_view(self, request, task_id):
|
||||
"""重新执行任务视图"""
|
||||
try:
|
||||
success, message = task_executor.rerun_task(task_id)
|
||||
if success:
|
||||
self.message_user(request, f'任务已重新执行: {message}', messages.SUCCESS)
|
||||
else:
|
||||
self.message_user(request, f'重新执行任务失败: {message}', messages.ERROR)
|
||||
except Exception as e:
|
||||
self.message_user(request, f'重新执行任务失败: {e}', messages.ERROR)
|
||||
|
||||
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
|
||||
|
||||
def cancel_task_view(self, request, task_id):
|
||||
"""取消任务视图"""
|
||||
try:
|
||||
success, message = task_executor.cancel_task(task_id)
|
||||
if success:
|
||||
self.message_user(request, f'任务已取消: {message}', messages.SUCCESS)
|
||||
else:
|
||||
self.message_user(request, f'取消任务失败: {message}', messages.ERROR)
|
||||
except Exception as e:
|
||||
self.message_user(request, f'取消任务失败: {e}', messages.ERROR)
|
||||
|
||||
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
|
||||
|
||||
def view_results_view(self, request, task_id):
|
||||
"""查看结果视图"""
|
||||
try:
|
||||
task = CrawlTask.objects.get(id=task_id)
|
||||
context = {
|
||||
'task': task,
|
||||
'title': f'任务结果 - {task.name}'
|
||||
}
|
||||
return admin.site.admin_view(self.render_create_task_template)(request, 'admin/task_results.html', context)
|
||||
except CrawlTask.DoesNotExist:
|
||||
self.message_user(request, '任务不存在', messages.ERROR)
|
||||
return HttpResponseRedirect(reverse('admin:core_crawltask_changelist'))
|
||||
|
||||
def render_create_task_template(self, request, template_name, context):
|
||||
"""渲染创建任务模板"""
|
||||
from django.template.loader import render_to_string
|
||||
from django.http import HttpResponse
|
||||
|
||||
context.update({
|
||||
'site_header': admin.site.site_header,
|
||||
'site_title': admin.site.site_title,
|
||||
'has_permission': True,
|
||||
'user': request.user,
|
||||
})
|
||||
|
||||
html = render_to_string(template_name, context)
|
||||
return HttpResponse(html)
|
||||
|
||||
|
||||
#class CrawlerStatusAdmin(admin.ModelAdmin):
|
||||
# """爬虫状态管理"""
|
||||
# change_list_template = 'admin/crawler_status.html'
|
||||
#
|
||||
# def changelist_view(self, request, extra_context=None):
|
||||
# """爬虫状态视图"""
|
||||
# # 获取分布式爬虫状态
|
||||
# nodes = distributed_crawler.get_available_nodes()
|
||||
# node_statuses = []
|
||||
#
|
||||
# for node_id in nodes:
|
||||
# status = distributed_crawler.get_node_status(node_id)
|
||||
# node_statuses.append(status)
|
||||
#
|
||||
# # 获取最近的批次
|
||||
# batches = distributed_crawler.get_all_batches()[:10]
|
||||
#
|
||||
# # 获取任务统计
|
||||
# task_stats = {
|
||||
# 'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]),
|
||||
# 'total_nodes': len(nodes),
|
||||
# 'total_batches': len(batches),
|
||||
# }
|
||||
#
|
||||
# extra_context = extra_context or {}
|
||||
# extra_context.update({
|
||||
# 'nodes': node_statuses,
|
||||
# 'batches': batches,
|
||||
# 'task_stats': task_stats,
|
||||
# })
|
||||
#
|
||||
# return super().changelist_view(request, extra_context)
|
||||
#
|
||||
|
||||
# 注册管理类
|
||||
admin.site.register(Website, WebsiteAdmin)
|
||||
admin.site.register(Article, ArticleAdmin)
|
||||
admin.site.register(CrawlTask, CrawlTaskAdmin)
|
||||
|
||||
|
||||
# 隐藏Celery Results管理功能
|
||||
# 禁用django_celery_results应用的自动注册
|
||||
try:
|
||||
from django_celery_results.models import TaskResult, GroupResult
|
||||
from django_celery_results.admin import TaskResultAdmin, GroupResultAdmin
|
||||
admin.site.unregister(TaskResult)
|
||||
admin.site.unregister(GroupResult)
|
||||
except:
|
||||
pass
|
||||
|
||||
# 隐藏Celery Beat周期任务管理功能
|
||||
# 禁用django_celery_beat应用的自动注册
|
||||
try:
|
||||
from django_celery_beat.models import PeriodicTask, ClockedSchedule, CrontabSchedule, SolarSchedule, IntervalSchedule
|
||||
admin.site.unregister(PeriodicTask)
|
||||
admin.site.unregister(ClockedSchedule)
|
||||
admin.site.unregister(CrontabSchedule)
|
||||
admin.site.unregister(SolarSchedule)
|
||||
admin.site.unregister(IntervalSchedule)
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
|
||||
# 自定义管理站点标题
|
||||
admin.site.site_header = 'Green Classroom 管理系统'
|
||||
admin.site.site_title = 'Green Classroom'
|
||||
admin.site.index_title = '欢迎使用 Green Classroom 管理系统'
|
||||
746 core/api.py (new file)
@@ -0,0 +1,746 @@
|
||||
"""
|
||||
RESTful API模块
|
||||
提供完整的API接口,支持爬虫管理、数据查询、任务控制
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Any
|
||||
import json
|
||||
import csv
|
||||
import io
|
||||
import zipfile
|
||||
|
||||
from django.http import JsonResponse, HttpResponse
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
from django.views.decorators.http import require_http_methods
|
||||
from django.core.paginator import Paginator
|
||||
from django.db.models import Q, Count
|
||||
from django.utils import timezone
|
||||
# 添加DRF相关导入
|
||||
from rest_framework.views import APIView
|
||||
from rest_framework.response import Response
|
||||
from rest_framework.permissions import IsAuthenticated
|
||||
from rest_framework.authentication import SessionAuthentication, TokenAuthentication
|
||||
|
||||
# 添加python-docx库支持
|
||||
from docx import Document
|
||||
|
||||
# 添加BeautifulSoup导入
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .models import Website, Article
|
||||
from .tasks import crawl_website, cleanup_old_articles
|
||||
from .distributed_crawler import distributed_crawler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def api_response(data=None, message="", status=200, error=None):
|
||||
"""统一的API响应格式"""
|
||||
response = {
|
||||
"success": status < 400,
|
||||
"message": message,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
if data is not None:
|
||||
response["data"] = data
|
||||
|
||||
if error:
|
||||
response["error"] = error
|
||||
|
||||
# 如果是DRF视图,则返回DRF Response
|
||||
if hasattr(api_response, '_use_drf_response') and api_response._use_drf_response:
|
||||
return Response(response, status=status)
|
||||
|
||||
return JsonResponse(response, status=status)
|
||||
|
||||
|
||||
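Since api_response() builds the envelope used by every endpoint in this module, here is what a plain (non-DRF) call returns; an illustrative snippet, not code from the repository:

# Illustrative only: the envelope built by api_response() above.
resp = api_response(data={"ping": "pong"}, message="ok")          # returns a JsonResponse
# Body (fields in insertion order; timestamp value is just a placeholder):
# {
#   "success": true,
#   "message": "ok",
#   "timestamp": "2024-01-01T12:00:00.000000",
#   "data": {"ping": "pong"}
# }
resp_err = api_response(message="not found", status=404, error="Article not found")
# -> {"success": false, "message": "not found", "timestamp": "...", "error": "Article not found"}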
# 修改健康检查接口为DRF类视图
|
||||
class HealthView(APIView):
|
||||
"""健康检查接口"""
|
||||
permission_classes = [] # 允许无认证访问
|
||||
authentication_classes = []
|
||||
|
||||
def get(self, request):
|
||||
try:
|
||||
# 检查数据库连接
|
||||
website_count = Website.objects.count()
|
||||
article_count = Article.objects.count()
|
||||
|
||||
# 检查Redis连接
|
||||
from django.core.cache import cache
|
||||
cache.set('health_check', 'ok', 60)
|
||||
cache_result = cache.get('health_check')
|
||||
|
||||
health_data = {
|
||||
"status": "healthy",
|
||||
"database": "ok",
|
||||
"redis": "ok" if cache_result == 'ok' else 'error',
|
||||
"website_count": website_count,
|
||||
"article_count": article_count,
|
||||
"uptime": "running"
|
||||
}
|
||||
|
||||
# 设置使用DRF响应
|
||||
api_response._use_drf_response = True
|
||||
return api_response(data=health_data, message="服务运行正常")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"健康检查失败: {e}")
|
||||
return api_response(
|
||||
data={"status": "unhealthy", "error": str(e)},
|
||||
message="服务异常",
|
||||
status=500,
|
||||
error=str(e)
|
||||
)
|
||||
finally:
|
||||
api_response._use_drf_response = False
|
||||
|
||||
|
||||
# 修改网站列表接口为DRF类视图
|
||||
class WebsitesView(APIView):
|
||||
"""获取网站列表"""
|
||||
permission_classes = [IsAuthenticated]
|
||||
authentication_classes = [SessionAuthentication, TokenAuthentication]
|
||||
|
||||
def get(self, request):
|
||||
try:
|
||||
# 分页参数
|
||||
page = int(request.GET.get('page', 1))
|
||||
page_size = int(request.GET.get('page_size', 20))
|
||||
search = request.GET.get('search', '')
|
||||
enabled = request.GET.get('enabled', '')
|
||||
|
||||
# 构建查询
|
||||
queryset = Website.objects.all()
|
||||
|
||||
if search:
|
||||
queryset = queryset.filter(
|
||||
Q(name__icontains=search) |
|
||||
Q(base_url__icontains=search)
|
||||
)
|
||||
|
||||
if enabled in ['true', 'false']:
|
||||
queryset = queryset.filter(enabled=enabled == 'true')
|
||||
|
||||
# 排序 - 使用id字段替代不存在的created_at字段
|
||||
queryset = queryset.order_by('-id')
|
||||
|
||||
# 分页
|
||||
paginator = Paginator(queryset, page_size)
|
||||
websites_page = paginator.get_page(page)
|
||||
|
||||
# 统计数据
|
||||
stats = {
|
||||
'total_websites': Website.objects.count(),
|
||||
'enabled_websites': Website.objects.filter(enabled=True).count(),
|
||||
'disabled_websites': Website.objects.filter(enabled=False).count(),
|
||||
}
|
||||
|
||||
# 序列化数据
|
||||
websites_data = []
|
||||
for website in websites_page:
|
||||
website_data = {
|
||||
'id': website.id,
|
||||
'name': website.name,
|
||||
'base_url': website.base_url,
|
||||
'enabled': website.enabled,
|
||||
# 移除不存在的created_at和updated_at字段
|
||||
'article_count': website.article_set.count(),
|
||||
'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
|
||||
}
|
||||
websites_data.append(website_data)
|
||||
|
||||
response_data = {
|
||||
'websites': websites_data,
|
||||
'pagination': {
|
||||
'page': page,
|
||||
'page_size': page_size,
|
||||
'total_pages': paginator.num_pages,
|
||||
'total_count': paginator.count,
|
||||
'has_next': websites_page.has_next(),
|
||||
'has_previous': websites_page.has_previous(),
|
||||
},
|
||||
'stats': stats
|
||||
}
|
||||
|
||||
# 设置使用DRF响应
|
||||
api_response._use_drf_response = True
|
||||
return api_response(data=response_data, message="获取网站列表成功")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"获取网站列表失败: {e}")
|
||||
return api_response(message="获取网站列表失败", status=500, error=str(e))
|
||||
finally:
|
||||
api_response._use_drf_response = False
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["GET"])
|
||||
def api_website_detail(request, website_id):
|
||||
"""获取网站详情"""
|
||||
try:
|
||||
website = Website.objects.get(id=website_id)
|
||||
|
||||
# 获取最近的文章
|
||||
recent_articles = website.article_set.order_by('-created_at')[:10]
|
||||
|
||||
website_data = {
|
||||
'id': website.id,
|
||||
'name': website.name,
|
||||
'base_url': website.base_url,
|
||||
'enabled': website.enabled,
|
||||
'created_at': website.created_at.isoformat(),
|
||||
'updated_at': website.updated_at.isoformat(),
|
||||
'last_crawl': website.last_crawl.isoformat() if website.last_crawl else None,
|
||||
'article_count': website.article_set.count(),
|
||||
'recent_articles': [
|
||||
{
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'url': article.url,
|
||||
'created_at': article.created_at.isoformat(),
|
||||
}
|
||||
for article in recent_articles
|
||||
]
|
||||
}
|
||||
|
||||
return api_response(data=website_data, message="获取网站详情成功")
|
||||
|
||||
except Website.DoesNotExist:
|
||||
return api_response(message="网站不存在", status=404, error="Website not found")
|
||||
except Exception as e:
|
||||
logger.error(f"获取网站详情失败: {e}")
|
||||
return api_response(message="获取网站详情失败", status=500, error=str(e))
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["POST"])
|
||||
def api_crawl_website(request, website_id):
|
||||
"""爬取指定网站"""
|
||||
try:
|
||||
website = Website.objects.get(id=website_id)
|
||||
|
||||
# 启动爬虫任务
|
||||
task = crawl_website.delay(website_id)
|
||||
|
||||
response_data = {
|
||||
'task_id': task.id,
|
||||
'website_id': website_id,
|
||||
'website_name': website.name,
|
||||
'status': 'started'
|
||||
}
|
||||
|
||||
return api_response(data=response_data, message="爬虫任务已启动")
|
||||
|
||||
except Website.DoesNotExist:
|
||||
return api_response(message="网站不存在", status=404, error="Website not found")
|
||||
except Exception as e:
|
||||
logger.error(f"启动爬虫任务失败: {e}")
|
||||
return api_response(message="启动爬虫任务失败", status=500, error=str(e))
|
||||
|
||||
|
||||
@csrf_exempt
@require_http_methods(["GET"])
def api_articles(request):
    """获取文章列表"""
    try:
        # 分页参数
        page = int(request.GET.get('page', 1))
        page_size = int(request.GET.get('page_size', 20))
        search = request.GET.get('search', '')
        website_id = request.GET.get('website_id', '')
        date_from = request.GET.get('date_from', '')
        date_to = request.GET.get('date_to', '')

        # 构建查询
        queryset = Article.objects.select_related('website').all()

        if search:
            queryset = queryset.filter(
                Q(title__icontains=search) |
                Q(content__icontains=search)
            )

        if website_id:
            queryset = queryset.filter(website_id=website_id)

        if date_from:
            try:
                date_from_obj = datetime.fromisoformat(date_from.replace('Z', '+00:00'))
                queryset = queryset.filter(created_at__gte=date_from_obj)
            except ValueError:
                pass

        if date_to:
            try:
                date_to_obj = datetime.fromisoformat(date_to.replace('Z', '+00:00'))
                queryset = queryset.filter(created_at__lte=date_to_obj)
            except ValueError:
                pass

        # 排序
        queryset = queryset.order_by('-created_at')

        # 分页
        paginator = Paginator(queryset, page_size)
        articles_page = paginator.get_page(page)

        # 统计数据
        stats = {
            'total_articles': Article.objects.count(),
            'today_articles': Article.objects.filter(
                created_at__date=timezone.now().date()
            ).count(),
            'week_articles': Article.objects.filter(
                created_at__gte=timezone.now() - timedelta(days=7)
            ).count(),
        }

        # 序列化数据
        articles_data = []
        for article in articles_page:
            article_data = {
                'id': article.id,
                'title': article.title,
                'url': article.url,
                'content': article.content[:200] + '...' if len(article.content) > 200 else article.content,
                'created_at': article.created_at.isoformat(),
                'website': {
                    'id': article.website.id,
                    'name': article.website.name,
                },
                'media_files': article.media_files,
            }
            articles_data.append(article_data)

        response_data = {
            'articles': articles_data,
            'pagination': {
                'page': page,
                'page_size': page_size,
                'total_pages': paginator.num_pages,
                'total_count': paginator.count,
                'has_next': articles_page.has_next(),
                'has_previous': articles_page.has_previous(),
            },
            'stats': stats
        }

        return api_response(data=response_data, message="获取文章列表成功")

    except Exception as e:
        logger.error(f"获取文章列表失败: {e}")
        return api_response(message="获取文章列表失败", status=500, error=str(e))


@csrf_exempt
@require_http_methods(["GET"])
def api_article_detail(request, article_id):
    """获取文章详情"""
    try:
        article = Article.objects.select_related('website').get(id=article_id)

        article_data = {
            'id': article.id,
            'title': article.title,
            'url': article.url,
            'content': article.content,
            'created_at': article.created_at.isoformat(),
            'website': {
                'id': article.website.id,
                'name': article.website.name,
                'base_url': article.website.base_url,
            },
            'media_files': article.media_files,
        }

        return api_response(data=article_data, message="获取文章详情成功")

    except Article.DoesNotExist:
        return api_response(message="文章不存在", status=404, error="Article not found")
    except Exception as e:
        logger.error(f"获取文章详情失败: {e}")
        return api_response(message="获取文章详情失败", status=500, error=str(e))


@csrf_exempt
@require_http_methods(["GET"])
def api_crawler_status(request):
    """获取爬虫状态"""
    try:
        # 获取分布式爬虫状态
        nodes = distributed_crawler.get_available_nodes()
        node_statuses = []

        for node_id in nodes:
            status = distributed_crawler.get_node_status(node_id)
            node_statuses.append(status)

        # 获取最近的批次
        batches = distributed_crawler.get_all_batches()[:10]

        # 获取任务统计
        task_stats = {
            'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]),
            'total_nodes': len(nodes),
            'total_batches': len(batches),
        }

        response_data = {
            'nodes': node_statuses,
            'batches': batches,
            'stats': task_stats,
        }

        return api_response(data=response_data, message="获取爬虫状态成功")

    except Exception as e:
        logger.error(f"获取爬虫状态失败: {e}")
        return api_response(message="获取爬虫状态失败", status=500, error=str(e))


@csrf_exempt
@require_http_methods(["POST"])
def api_start_distributed_crawl(request):
    """启动分布式爬取"""
    try:
        data = json.loads(request.body)
        website_ids = data.get('website_ids', [])

        if not website_ids:
            return api_response(message="请选择要爬取的网站", status=400, error="No websites selected")

        # 启动分布式爬取
        batch_id = distributed_crawler.distribute_crawl_tasks(website_ids)

        if batch_id in ['no_websites', 'no_available_nodes']:
            return api_response(message="无法启动分布式爬取", status=400, error=batch_id)

        response_data = {
            'batch_id': batch_id,
            'website_ids': website_ids,
            'status': 'started'
        }

        return api_response(data=response_data, message="分布式爬取已启动")

    except json.JSONDecodeError:
        return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
    except Exception as e:
        logger.error(f"启动分布式爬取失败: {e}")
        return api_response(message="启动分布式爬取失败", status=500, error=str(e))


@csrf_exempt
@require_http_methods(["GET"])
def api_batch_status(request, batch_id):
    """获取批次状态"""
    try:
        batch_status = distributed_crawler.get_batch_status(batch_id)

        if batch_status.get('status') == 'not_found':
            return api_response(message="批次不存在", status=404, error="Batch not found")

        return api_response(data=batch_status, message="获取批次状态成功")

    except Exception as e:
        logger.error(f"获取批次状态失败: {e}")
        return api_response(message="获取批次状态失败", status=500, error=str(e))


@csrf_exempt
@require_http_methods(["GET", "POST"])
def api_cleanup_articles(request):
    """清理旧文章"""
    # 如果是GET请求,返回清理功能的描述信息
    if request.method == "GET":
        response_data = {
            'description': '文章清理API',
            'method': 'POST',
            'parameters': {
                'days': '保留天数,默认30天'
            },
            'example': {
                'days': 30
            }
        }
        return api_response(data=response_data, message="API使用说明")

    try:
        data = json.loads(request.body)
        days = data.get('days', 30)

        # 启动清理任务
        task = cleanup_old_articles.delay(days)

        response_data = {
            'task_id': task.id,
            'days': days,
            'status': 'started'
        }

        return api_response(data=response_data, message="清理任务已启动")

    except json.JSONDecodeError:
        return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
    except Exception as e:
        logger.error(f"启动清理任务失败: {e}")
        return api_response(message="启动清理任务失败", status=500, error=str(e))


@csrf_exempt
@require_http_methods(["GET"])
def api_stats(request):
    """获取统计信息"""
    try:
        # 基础统计
        total_websites = Website.objects.count()
        total_articles = Article.objects.count()
        enabled_websites = Website.objects.filter(enabled=True).count()

        # 时间统计
        today = timezone.now().date()
        week_ago = timezone.now() - timedelta(days=7)
        month_ago = timezone.now() - timedelta(days=30)

        today_articles = Article.objects.filter(created_at__date=today).count()
        week_articles = Article.objects.filter(created_at__gte=week_ago).count()
        month_articles = Article.objects.filter(created_at__gte=month_ago).count()

        # 网站统计
        website_stats = []
        for website in Website.objects.all():
            website_stats.append({
                'id': website.id,
                'name': website.name,
                'article_count': website.article_set.count(),
                # 使用getattr安全访问last_crawl属性,如果不存在则返回None
                'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
            })

        # 分布式爬虫统计
        nodes = distributed_crawler.get_available_nodes()
        batches = distributed_crawler.get_all_batches()

        response_data = {
            'overview': {
                'total_websites': total_websites,
                'enabled_websites': enabled_websites,
                'total_articles': total_articles,
                'today_articles': today_articles,
                'week_articles': week_articles,
                'month_articles': month_articles,
            },
            'websites': website_stats,
            'crawler': {
                'active_nodes': len(nodes),
                'total_batches': len(batches),
                'recent_batches': batches[:5],
            }
        }

        return api_response(data=response_data, message="获取统计信息成功")

    except Exception as e:
        logger.error(f"获取统计信息失败: {e}")
        return api_response(message="获取统计信息失败", status=500, error=str(e))


@csrf_exempt
|
||||
@require_http_methods(["POST"])
|
||||
def export_articles(request):
|
||||
"""导出文章"""
|
||||
try:
|
||||
data = json.loads(request.body)
|
||||
article_ids = data.get('article_ids', [])
|
||||
export_format = data.get('format', 'docx') # 默认改为docx格式
|
||||
|
||||
if not article_ids:
|
||||
return api_response(message="请选择要导出的文章", status=400, error="No articles selected")
|
||||
|
||||
# 获取文章数据
|
||||
articles = Article.objects.filter(id__in=article_ids).select_related('website')
|
||||
|
||||
if not articles.exists():
|
||||
return api_response(message="未找到指定的文章", status=404, error="Articles not found")
|
||||
|
||||
import os # 添加导入
|
||||
from django.conf import settings # 添加导入
|
||||
|
||||
if export_format == 'json':
|
||||
# 导出为JSON格式
|
||||
articles_data = []
|
||||
for article in articles:
|
||||
articles_data.append({
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'url': article.url,
|
||||
'content': article.content,
|
||||
'created_at': article.created_at.isoformat(),
|
||||
'website': {
|
||||
'id': article.website.id,
|
||||
'name': article.website.name,
|
||||
},
|
||||
'media_files': article.media_files,
|
||||
})
|
||||
|
||||
response = HttpResponse(
|
||||
json.dumps(articles_data, ensure_ascii=False, indent=2),
|
||||
content_type='application/json'
|
||||
)
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.json"'
|
||||
return response
|
||||
|
||||
elif export_format == 'csv':
|
||||
# 导出为CSV格式
|
||||
output = io.StringIO()
|
||||
writer = csv.writer(output)
|
||||
writer.writerow(['ID', '标题', '网址', '内容', '创建时间', '网站'])
|
||||
|
||||
for article in articles:
|
||||
writer.writerow([
|
||||
article.id,
|
||||
article.title,
|
||||
article.url,
|
||||
article.content[:1000] + '...' if len(article.content) > 1000 else article.content,
|
||||
article.created_at.isoformat(),
|
||||
article.website.name
|
||||
])
|
||||
|
||||
response = HttpResponse(output.getvalue(), content_type='text/csv')
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.csv"'
|
||||
return response
|
||||
|
||||
elif export_format == 'docx':
|
||||
# 导出为Word格式,每个文章一个文件夹
|
||||
zip_buffer = io.BytesIO()
|
||||
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
|
||||
for article in articles:
|
||||
# 创建文章文件夹名称
|
||||
safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip()
|
||||
folder_name = f"article_{article.id}_{safe_title}"[:50]
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading(article.title, 0)
|
||||
|
||||
# 添加文章信息
|
||||
doc.add_paragraph(f"网站: {article.website.name}")
|
||||
doc.add_paragraph(f"网址: {article.url}")
|
||||
doc.add_paragraph(f"发布时间: {article.pub_date.isoformat() if article.pub_date else 'N/A'}")
|
||||
doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}")
|
||||
|
||||
# 添加内容标题
|
||||
doc.add_heading('内容:', level=1)
|
||||
|
||||
# 处理HTML内容
|
||||
content_text = BeautifulSoup(article.content, 'html.parser').get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 将文档保存到内存中
|
||||
doc_buffer = io.BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
|
||||
# 添加到ZIP文件
|
||||
zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue())
|
||||
|
||||
# 添加媒体文件(如果存在)
|
||||
if article.media_files:
|
||||
for media in article.media_files:
|
||||
try:
|
||||
# 如果是本地文件路径
|
||||
if not media.startswith('http'):
|
||||
media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/'))
|
||||
if os.path.exists(media_path):
|
||||
zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}")
|
||||
# 如果是URL格式的媒体文件
|
||||
else:
|
||||
import requests
|
||||
from io import BytesIO
|
||||
|
||||
response = requests.get(media, timeout=10)
|
||||
if response.status_code == 200:
|
||||
image_stream = BytesIO(response.content)
|
||||
media_filename = f"{folder_name}/media/{os.path.basename(media)}"
|
||||
zip_file.writestr(media_filename, image_stream.getvalue())
|
||||
except Exception:
|
||||
# 忽略无法添加的媒体文件
|
||||
pass
|
||||
|
||||
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.zip"'
|
||||
return response
|
||||
|
||||
elif export_format == 'zip':
|
||||
# 导出为ZIP包,每个文章一个文件夹
|
||||
zip_buffer = io.BytesIO()
|
||||
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
|
||||
for article in articles:
|
||||
# 创建文章文件夹名称
|
||||
safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip()
|
||||
folder_name = f"article_{article.id}_{safe_title}"[:50]
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading(article.title, 0)
|
||||
|
||||
# 添加文章信息
|
||||
doc.add_paragraph(f"网站: {article.website.name}")
|
||||
doc.add_paragraph(f"网址: {article.url}")
|
||||
doc.add_paragraph(f"发布时间: {article.pub_date.isoformat() if article.pub_date else 'N/A'}")
|
||||
doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}")
|
||||
|
||||
# 添加内容标题
|
||||
doc.add_heading('内容:', level=1)
|
||||
|
||||
# 处理HTML内容
|
||||
content_text = BeautifulSoup(article.content, 'html.parser').get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 将文档保存到内存中
|
||||
doc_buffer = io.BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
|
||||
# 添加到ZIP文件
|
||||
zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue())
|
||||
|
||||
# 添加媒体文件(如果存在)
|
||||
if article.media_files:
|
||||
for media in article.media_files:
|
||||
try:
|
||||
# 如果是本地文件路径
|
||||
if not media.startswith('http'):
|
||||
media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/'))
|
||||
if os.path.exists(media_path):
|
||||
zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}")
|
||||
# 如果是URL格式的媒体文件
|
||||
else:
|
||||
import requests
|
||||
from io import BytesIO
|
||||
|
||||
response = requests.get(media, timeout=10)
|
||||
if response.status_code == 200:
|
||||
image_stream = BytesIO(response.content)
|
||||
media_filename = f"{folder_name}/media/{os.path.basename(media)}"
|
||||
zip_file.writestr(media_filename, image_stream.getvalue())
|
||||
except Exception:
|
||||
# 忽略无法添加的媒体文件
|
||||
pass
|
||||
|
||||
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.zip"'
|
||||
return response
|
||||
|
||||
else:
|
||||
return api_response(message="不支持的导出格式", status=400, error="Unsupported format")
|
||||
|
||||
except json.JSONDecodeError:
|
||||
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
|
||||
except Exception as e:
|
||||
logger.error(f"导出文章失败: {e}")
|
||||
return api_response(message="导出文章失败", status=500, error=str(e))
|
||||
@@ -4,3 +4,8 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'core'

    def ready(self):
        """应用启动时执行"""
        # 导入Admin扩展
        import core.admin_extended

276
core/distributed_crawler.py
Normal file
@@ -0,0 +1,276 @@
|
||||
"""
|
||||
分布式爬虫模块
|
||||
支持多节点爬虫集群,任务分发和结果聚合
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from typing import Dict, List, Optional, Any
|
||||
from celery import group, chain
|
||||
from django.conf import settings
|
||||
from django.core.cache import cache
|
||||
from django.db import transaction
|
||||
from .models import Website, Article
|
||||
from .tasks import crawl_website, crawl_all_websites
|
||||
from .utils import full_site_crawler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DistributedCrawler:
|
||||
"""分布式爬虫管理器"""
|
||||
|
||||
def __init__(self):
|
||||
self.cache_prefix = "crawler:distributed:"
|
||||
self.task_timeout = getattr(settings, 'CRAWLER_TASK_TIMEOUT', 1800) # 30分钟
|
||||
|
||||
def get_node_status(self, node_id: str) -> Dict[str, Any]:
|
||||
"""获取节点状态"""
|
||||
cache_key = f"{self.cache_prefix}node:{node_id}:status"
|
||||
status = cache.get(cache_key, {})
|
||||
return {
|
||||
'node_id': node_id,
|
||||
'status': status.get('status', 'unknown'),
|
||||
'last_heartbeat': status.get('last_heartbeat'),
|
||||
'active_tasks': status.get('active_tasks', 0),
|
||||
'completed_tasks': status.get('completed_tasks', 0),
|
||||
'failed_tasks': status.get('failed_tasks', 0),
|
||||
}
|
||||
|
||||
def register_node(self, node_id: str, capacity: int = 10) -> bool:
|
||||
"""注册爬虫节点"""
|
||||
cache_key = f"{self.cache_prefix}node:{node_id}:status"
|
||||
status = {
|
||||
'status': 'active',
|
||||
'capacity': capacity,
|
||||
'active_tasks': 0,
|
||||
'completed_tasks': 0,
|
||||
'failed_tasks': 0,
|
||||
'last_heartbeat': time.time(),
|
||||
'registered_at': time.time(),
|
||||
}
|
||||
cache.set(cache_key, status, timeout=3600) # 1小时过期
|
||||
|
||||
# 添加到节点列表
|
||||
nodes_key = f"{self.cache_prefix}active_nodes"
|
||||
nodes = cache.get(nodes_key, [])
|
||||
if node_id not in nodes:
|
||||
nodes.append(node_id)
|
||||
cache.set(nodes_key, nodes, timeout=3600)
|
||||
|
||||
logger.info(f"注册爬虫节点: {node_id}, 容量: {capacity}")
|
||||
return True
|
||||
|
||||
def unregister_node(self, node_id: str) -> bool:
|
||||
"""注销爬虫节点"""
|
||||
cache_key = f"{self.cache_prefix}node:{node_id}:status"
|
||||
cache.delete(cache_key)
|
||||
|
||||
# 从节点列表移除
|
||||
nodes_key = f"{self.cache_prefix}active_nodes"
|
||||
nodes = cache.get(nodes_key, [])
|
||||
if node_id in nodes:
|
||||
nodes.remove(node_id)
|
||||
cache.set(nodes_key, nodes, timeout=3600)
|
||||
|
||||
logger.info(f"注销爬虫节点: {node_id}")
|
||||
return True
|
||||
|
||||
def heartbeat(self, node_id: str, active_tasks: int = 0) -> bool:
|
||||
"""节点心跳"""
|
||||
cache_key = f"{self.cache_prefix}node:{node_id}:status"
|
||||
status = cache.get(cache_key, {})
|
||||
if status:
|
||||
status['last_heartbeat'] = time.time()
|
||||
status['active_tasks'] = active_tasks
|
||||
cache.set(cache_key, status, timeout=3600)
|
||||
return True
|
||||
|
||||
def get_available_nodes(self) -> List[str]:
|
||||
"""获取可用节点列表"""
|
||||
nodes_key = f"{self.cache_prefix}active_nodes"
|
||||
nodes = cache.get(nodes_key, [])
|
||||
available_nodes = []
|
||||
|
||||
for node_id in nodes:
|
||||
status = self.get_node_status(node_id)
|
||||
if status['status'] == 'active':
|
||||
# 检查心跳是否在5分钟内
|
||||
if status['last_heartbeat'] and (time.time() - status['last_heartbeat']) < 300:
|
||||
available_nodes.append(node_id)
|
||||
|
||||
return available_nodes
|
||||
|
||||
def distribute_crawl_tasks(self, websites: List[int], max_concurrent: int = 5) -> str:
|
||||
"""分发爬虫任务到多个节点"""
|
||||
if not websites:
|
||||
return "no_websites"
|
||||
|
||||
available_nodes = self.get_available_nodes()
|
||||
if not available_nodes:
|
||||
logger.warning("没有可用的爬虫节点")
|
||||
return "no_available_nodes"
|
||||
|
||||
# 创建任务批次
|
||||
batch_id = f"batch_{int(time.time())}"
|
||||
batch_key = f"{self.cache_prefix}batch:{batch_id}"
|
||||
|
||||
# 将网站分组分配给不同节点
|
||||
tasks = []
|
||||
for i, website_id in enumerate(websites):
|
||||
node_id = available_nodes[i % len(available_nodes)]
|
||||
task = crawl_website.apply_async(
|
||||
args=[website_id],
|
||||
kwargs={'node_id': node_id, 'batch_id': batch_id},
|
||||
countdown=i * 2 # 错开启动时间
|
||||
)
|
||||
tasks.append(task)
|
||||
|
||||
# 保存批次信息
|
||||
batch_info = {
|
||||
'batch_id': batch_id,
|
||||
'websites': websites,
|
||||
'tasks': [task.id for task in tasks],
|
||||
'nodes': available_nodes,
|
||||
'status': 'running',
|
||||
'created_at': time.time(),
|
||||
'total_tasks': len(tasks),
|
||||
'completed_tasks': 0,
|
||||
'failed_tasks': 0,
|
||||
}
|
||||
cache.set(batch_key, batch_info, timeout=7200) # 2小时过期
|
||||
|
||||
logger.info(f"创建分布式爬虫批次: {batch_id}, 任务数: {len(tasks)}, 节点数: {len(available_nodes)}")
|
||||
return batch_id
|
||||
|
||||
def get_batch_status(self, batch_id: str) -> Dict[str, Any]:
|
||||
"""获取批次状态"""
|
||||
batch_key = f"{self.cache_prefix}batch:{batch_id}"
|
||||
batch_info = cache.get(batch_key, {})
|
||||
|
||||
if not batch_info:
|
||||
return {'status': 'not_found'}
|
||||
|
||||
# 统计任务状态
|
||||
completed = 0
|
||||
failed = 0
|
||||
running = 0
|
||||
|
||||
for task_id in batch_info.get('tasks', []):
|
||||
task_result = cache.get(f"{self.cache_prefix}task:{task_id}")
|
||||
if task_result:
|
||||
if task_result.get('status') == 'completed':
|
||||
completed += 1
|
||||
elif task_result.get('status') == 'failed':
|
||||
failed += 1
|
||||
else:
|
||||
running += 1
|
||||
|
||||
batch_info.update({
|
||||
'completed_tasks': completed,
|
||||
'failed_tasks': failed,
|
||||
'running_tasks': running,
|
||||
'progress': (completed + failed) / batch_info.get('total_tasks', 1) * 100
|
||||
})
|
||||
|
||||
# 检查是否完成
|
||||
if completed + failed >= batch_info.get('total_tasks', 0):
|
||||
batch_info['status'] = 'completed'
|
||||
|
||||
cache.set(batch_key, batch_info, timeout=7200)
|
||||
return batch_info
|
||||
|
||||
def get_all_batches(self) -> List[Dict[str, Any]]:
|
||||
"""获取所有批次"""
|
||||
pattern = f"{self.cache_prefix}batch:*"
|
||||
batches = []
|
||||
|
||||
# 这里简化实现,实际应该使用Redis的SCAN命令
|
||||
for i in range(100): # 假设最多100个批次
|
||||
batch_key = f"{self.cache_prefix}batch:batch_{i}"
|
||||
batch_info = cache.get(batch_key)
|
||||
if batch_info:
|
||||
batches.append(batch_info)
|
||||
|
||||
return sorted(batches, key=lambda x: x.get('created_at', 0), reverse=True)
|
||||
|
||||
def cleanup_old_batches(self, max_age_hours: int = 24) -> int:
|
||||
"""清理旧的批次数据"""
|
||||
cutoff_time = time.time() - (max_age_hours * 3600)
|
||||
cleaned = 0
|
||||
|
||||
for i in range(100):
|
||||
batch_key = f"{self.cache_prefix}batch:batch_{i}"
|
||||
batch_info = cache.get(batch_key)
|
||||
if batch_info and batch_info.get('created_at', 0) < cutoff_time:
|
||||
cache.delete(batch_key)
|
||||
cleaned += 1
|
||||
|
||||
logger.info(f"清理了 {cleaned} 个旧批次")
|
||||
return cleaned
|
||||
|
||||
|
||||
class CrawlerNode:
|
||||
"""爬虫节点"""
|
||||
|
||||
def __init__(self, node_id: str, capacity: int = 10):
|
||||
self.node_id = node_id
|
||||
self.capacity = capacity
|
||||
self.distributed_crawler = DistributedCrawler()
|
||||
self.active_tasks = 0
|
||||
|
||||
def start(self):
|
||||
"""启动节点"""
|
||||
self.distributed_crawler.register_node(self.node_id, self.capacity)
|
||||
logger.info(f"爬虫节点 {self.node_id} 已启动")
|
||||
|
||||
def stop(self):
|
||||
"""停止节点"""
|
||||
self.distributed_crawler.unregister_node(self.node_id)
|
||||
logger.info(f"爬虫节点 {self.node_id} 已停止")
|
||||
|
||||
def heartbeat(self):
|
||||
"""发送心跳"""
|
||||
self.distributed_crawler.heartbeat(self.node_id, self.active_tasks)
|
||||
|
||||
def process_task(self, website_id: int, batch_id: str = None) -> Dict[str, Any]:
|
||||
"""处理爬虫任务"""
|
||||
self.active_tasks += 1
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
# 执行爬虫任务
|
||||
website = Website.objects.get(id=website_id)
|
||||
result = full_site_crawler(website.base_url, website, max_pages=100)
|
||||
|
||||
# 记录任务结果
|
||||
task_result = {
|
||||
'status': 'completed',
|
||||
'website_id': website_id,
|
||||
'website_name': website.name,
|
||||
'result': result,
|
||||
'duration': time.time() - start_time,
|
||||
'completed_at': time.time(),
|
||||
}
|
||||
|
||||
logger.info(f"节点 {self.node_id} 完成网站 {website.name} 爬取")
|
||||
|
||||
except Exception as e:
|
||||
task_result = {
|
||||
'status': 'failed',
|
||||
'website_id': website_id,
|
||||
'error': str(e),
|
||||
'duration': time.time() - start_time,
|
||||
'failed_at': time.time(),
|
||||
}
|
||||
logger.error(f"节点 {self.node_id} 爬取网站 {website_id} 失败: {e}")
|
||||
|
||||
finally:
|
||||
self.active_tasks -= 1
|
||||
|
||||
return task_result
|
||||
|
||||
|
||||
# 全局分布式爬虫实例
|
||||
distributed_crawler = DistributedCrawler()
|
||||
765
core/keyword_crawler.py
Normal file
@@ -0,0 +1,765 @@
|
||||
"""
|
||||
关键词爬虫引擎
|
||||
基于 crawler_engine.py 的关键词爬取方法改进
|
||||
"""
|
||||
|
||||
import requests
|
||||
import time
|
||||
import re
|
||||
import logging
|
||||
import os
|
||||
import urllib3
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.core.files.base import ContentFile
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
from .models import Website, CrawlTask, Article
|
||||
from .utils import get_page_with_selenium, get_page_with_requests, check_keyword_in_content
|
||||
|
||||
# 禁用SSL警告
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
# 设置日志记录器
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class KeywordCrawler:
|
||||
"""关键词爬虫引擎"""
|
||||
|
||||
def __init__(self, task_id, task_executor_instance=None):
|
||||
self.task = CrawlTask.objects.get(id=task_id)
|
||||
self.task_id = task_id
|
||||
self.task_executor = task_executor_instance
|
||||
self.keywords = [kw.strip() for kw in self.task.keyword.split(',') if kw.strip()] if self.task.keyword else []
|
||||
|
||||
# 创建带重试策略的会话
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
})
|
||||
|
||||
# 设置重试策略
|
||||
retry_strategy = Retry(
|
||||
total=3,
|
||||
backoff_factor=1,
|
||||
status_forcelist=[429, 500, 502, 503, 504],
|
||||
)
|
||||
adapter = HTTPAdapter(max_retries=retry_strategy)
|
||||
self.session.mount("http://", adapter)
|
||||
self.session.mount("https://", adapter)
|
||||
|
||||
# 设置超时
|
||||
self.timeout = 15
|
||||
|
||||
def log(self, level, message, website=None):
|
||||
"""记录日志"""
|
||||
print(f"[{level.upper()}] {message}")
|
||||
logger.log(getattr(logging, level.upper()), f"Task {self.task.id}: {message}")
|
||||
|
||||
def is_cancelled(self):
|
||||
"""检查任务是否已被取消"""
|
||||
if self.task_executor:
|
||||
return self.task_executor.is_task_cancelled(self.task_id)
|
||||
return False
|
||||
|
||||
def update_task_status(self, status, **kwargs):
|
||||
"""更新任务状态"""
|
||||
self.task.status = status
|
||||
if status == 'running' and not self.task.started_at:
|
||||
self.task.started_at = timezone.now()
|
||||
elif status in ['completed', 'failed', 'cancelled']:
|
||||
self.task.completed_at = timezone.now()
|
||||
|
||||
for key, value in kwargs.items():
|
||||
setattr(self.task, key, value)
|
||||
self.task.save()
|
||||
|
||||
def extract_text_content(self, soup):
|
||||
"""提取文本内容,保持段落结构"""
|
||||
# 移除脚本和样式标签
|
||||
for script in soup(["script", "style"]):
|
||||
script.decompose()
|
||||
|
||||
# 处理段落标签,保持段落结构
|
||||
paragraphs = []
|
||||
|
||||
# 查找所有段落相关的标签
|
||||
for element in soup.find_all(['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br']):
|
||||
if element.name in ['p', 'div']:
|
||||
text = element.get_text().strip()
|
||||
if text:
|
||||
paragraphs.append(text)
|
||||
elif element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
|
||||
text = element.get_text().strip()
|
||||
if text:
|
||||
paragraphs.append(f"\n{text}\n") # 标题前后加换行
|
||||
elif element.name == 'br':
|
||||
paragraphs.append('\n')
|
||||
|
||||
# 如果没有找到段落标签,使用原来的方法
|
||||
if not paragraphs:
|
||||
text = soup.get_text()
|
||||
# 清理文本但保持换行
|
||||
lines = []
|
||||
for line in text.splitlines():
|
||||
line = line.strip()
|
||||
if line:
|
||||
lines.append(line)
|
||||
return '\n\n'.join(lines)
|
||||
|
||||
# 合并段落,用双换行分隔
|
||||
content = '\n\n'.join(paragraphs)
|
||||
|
||||
# 清理多余的空行
|
||||
content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
|
||||
|
||||
return content.strip()
|
||||
|
||||
def clean_url(self, url):
|
||||
"""清理和修复URL"""
|
||||
try:
|
||||
# 处理空值或None
|
||||
if not url or url is None:
|
||||
return ""
|
||||
|
||||
# 修复常见的URL问题
|
||||
# 将错误的编码字符恢复
|
||||
url = str(url).replace('%C3%97', '×') # 修复 × 字符的错误编码
|
||||
url = url.replace('%E2%80%93', '–') # 修复 – 字符的错误编码
|
||||
url = url.replace('%E2%80%94', '—') # 修复 — 字符的错误编码
|
||||
|
||||
# 解析URL并重新构建
|
||||
parsed = urlparse(url)
|
||||
|
||||
# 清理查询参数
|
||||
if parsed.query:
|
||||
# 处理查询参数中的编码问题
|
||||
from urllib.parse import parse_qs, urlencode, unquote
|
||||
query_params = parse_qs(parsed.query)
|
||||
cleaned_params = {}
|
||||
|
||||
for key, values in query_params.items():
|
||||
# 解码参数名
|
||||
clean_key = unquote(key)
|
||||
# 解码参数值
|
||||
clean_values = [unquote(val) for val in values]
|
||||
cleaned_params[clean_key] = clean_values
|
||||
|
||||
# 重新构建查询字符串
|
||||
query_string = urlencode(cleaned_params, doseq=True)
|
||||
else:
|
||||
query_string = ''
|
||||
|
||||
# 重新构建URL
|
||||
clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
|
||||
if query_string:
|
||||
clean_url += f"?{query_string}"
|
||||
if parsed.fragment:
|
||||
clean_url += f"#{parsed.fragment}"
|
||||
|
||||
return clean_url
|
||||
|
||||
except Exception as e:
|
||||
self.log('warning', f'URL清理失败: {url}, 错误: {e}')
|
||||
return url
|
||||
|
||||
def is_valid_article_url(self, url):
|
||||
"""检查是否是有效的文章URL"""
|
||||
try:
|
||||
# 排除一些明显不是文章的URL
|
||||
exclude_patterns = [
|
||||
'javascript:', 'mailto:', '#', 'tel:',
|
||||
'.pdf', '.doc', '.docx', '.xls', '.xlsx',
|
||||
'.jpg', '.jpeg', '.png', '.gif', '.svg',
|
||||
'.mp3', '.mp4', '.avi', '.mov'
|
||||
]
|
||||
|
||||
url_lower = url.lower()
|
||||
for pattern in exclude_patterns:
|
||||
if pattern in url_lower:
|
||||
return False
|
||||
|
||||
# 检查URL长度
|
||||
if len(url) < 10:
|
||||
return False
|
||||
|
||||
# 检查是否包含文章相关的关键词
|
||||
article_keywords = ['article', 'news', 'content', 'detail', 'view', 'show', 'post']
|
||||
url_lower = url.lower()
|
||||
for keyword in article_keywords:
|
||||
if keyword in url_lower:
|
||||
return True
|
||||
|
||||
# 如果URL看起来像文章ID或路径,也认为是有效的
|
||||
if any(char.isdigit() for char in url) and len(url.split('/')) > 3:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def find_article_links(self, soup, base_url):
|
||||
"""查找文章链接"""
|
||||
links = []
|
||||
seen_urls = set() # 避免重复URL
|
||||
|
||||
# 常见的文章链接选择器
|
||||
selectors = [
|
||||
'a[href*="article"]',
|
||||
'a[href*="news"]',
|
||||
'a[href*="content"]',
|
||||
'a[href*="detail"]',
|
||||
'a[href*="view"]',
|
||||
'a[href*="show"]',
|
||||
'.news-list a',
|
||||
'.article-list a',
|
||||
'.content-list a',
|
||||
'h3 a',
|
||||
'h4 a',
|
||||
'.title a',
|
||||
'.list-item a'
|
||||
]
|
||||
|
||||
for selector in selectors:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
href = element.get('href')
|
||||
if href:
|
||||
# 清理和修复URL
|
||||
clean_href = self.clean_url(href)
|
||||
full_url = urljoin(base_url, clean_href)
|
||||
|
||||
# 再次清理完整URL
|
||||
full_url = self.clean_url(full_url)
|
||||
|
||||
# 检查URL是否有效且未重复
|
||||
if (full_url not in seen_urls and
|
||||
self.is_valid_article_url(full_url) and
|
||||
full_url.startswith(('http://', 'https://'))):
|
||||
|
||||
title = element.get_text().strip()
|
||||
if title and len(title) > 5: # 过滤掉太短的标题
|
||||
links.append({
|
||||
'url': full_url,
|
||||
'title': title
|
||||
})
|
||||
seen_urls.add(full_url)
|
||||
|
||||
return links
|
||||
|
||||
def check_keyword_match(self, text, title):
|
||||
"""检查关键字匹配 - 改进版本"""
|
||||
matched_keywords = []
|
||||
text_lower = text.lower()
|
||||
title_lower = title.lower()
|
||||
|
||||
for keyword in self.keywords:
|
||||
keyword_lower = keyword.lower()
|
||||
# 使用改进的关键字检查函数
|
||||
if check_keyword_in_content(text, keyword) or check_keyword_in_content(title, keyword):
|
||||
matched_keywords.append(keyword)
|
||||
|
||||
return matched_keywords
|
||||
|
||||
def extract_article_content(self, url, soup):
|
||||
"""提取文章内容"""
|
||||
# 尝试多种内容选择器
|
||||
content_selectors = [
|
||||
'.article-content',
|
||||
'.content',
|
||||
'.article-body',
|
||||
'.news-content',
|
||||
'.main-content',
|
||||
'.post-content',
|
||||
'article',
|
||||
'.detail-content',
|
||||
'#content',
|
||||
'.text',
|
||||
'.box_con', # 新华网等网站使用
|
||||
'.content_area', # 央视网等网站使用
|
||||
]
|
||||
|
||||
content = ""
|
||||
for selector in content_selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
content = self.extract_text_content(element)
|
||||
if len(content) > 100: # 确保内容足够长
|
||||
break
|
||||
|
||||
# 如果没找到特定内容区域,使用整个页面
|
||||
if not content or len(content) < 100:
|
||||
content = self.extract_text_content(soup)
|
||||
|
||||
return content
|
||||
|
||||
def extract_publish_date(self, soup):
|
||||
"""提取发布时间"""
|
||||
date_selectors = [
|
||||
'.publish-time',
|
||||
'.pub-time',
|
||||
'.date',
|
||||
'.time',
|
||||
'.publish-date',
|
||||
'time[datetime]',
|
||||
'.article-time',
|
||||
'.news-time',
|
||||
'.post-time',
|
||||
'.create-time',
|
||||
'.update-time',
|
||||
'.time span',
|
||||
'.date span',
|
||||
'.info span',
|
||||
'.meta span',
|
||||
'.meta-info',
|
||||
'.article-info span',
|
||||
'.news-info span',
|
||||
'.content-info span',
|
||||
'.a-shijian',
|
||||
'.l-time'
|
||||
]
|
||||
|
||||
for selector in date_selectors:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
date_text = element.get_text().strip()
|
||||
if element.get('datetime'):
|
||||
date_text = element.get('datetime')
|
||||
|
||||
# 如果文本太短或为空,跳过
|
||||
if not date_text or len(date_text) < 4:
|
||||
continue
|
||||
|
||||
# 尝试解析日期
|
||||
try:
|
||||
from datetime import datetime
|
||||
|
||||
# 清理日期文本
|
||||
date_text = re.sub(r'发布(时间|日期)[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'时间[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'日期[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'发表于[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'更新[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'\s+', ' ', date_text).strip()
|
||||
|
||||
# 如果有 datetime 属性且是标准格式,直接使用
|
||||
if element.get('datetime'):
|
||||
datetime_attr = element.get('datetime')
|
||||
# 尝试解析常见的日期时间格式
|
||||
for fmt in [
|
||||
'%Y-%m-%d %H:%M:%S',
|
||||
'%Y-%m-%dT%H:%M:%S',
|
||||
'%Y-%m-%dT%H:%M:%S%z',
|
||||
'%Y-%m-%d %H:%M',
|
||||
'%Y-%m-%d',
|
||||
'%Y/%m/%d %H:%M:%S',
|
||||
'%Y/%m/%d %H:%M',
|
||||
'%Y/%m/%d',
|
||||
'%Y年%m月%d日 %H:%M:%S',
|
||||
'%Y年%m月%d日 %H:%M',
|
||||
'%Y年%m月%d日',
|
||||
]:
|
||||
try:
|
||||
if '%z' in fmt and '+' not in datetime_attr and datetime_attr.endswith('Z'):
|
||||
datetime_attr = datetime_attr[:-1] + '+0000'
|
||||
parsed_date = datetime.strptime(datetime_attr, fmt)
|
||||
if not timezone.is_aware(parsed_date):
|
||||
parsed_date = timezone.make_aware(parsed_date)
|
||||
return parsed_date
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# 尝试解析从文本中提取的日期
|
||||
for fmt in [
|
||||
'%Y年%m月%d日 %H:%M:%S',
|
||||
'%Y年%m月%d日 %H:%M',
|
||||
'%Y年%m月%d日',
|
||||
'%Y-%m-%d %H:%M:%S',
|
||||
'%Y-%m-%d %H:%M',
|
||||
'%Y-%m-%d',
|
||||
'%Y/%m/%d %H:%M:%S',
|
||||
'%Y/%m/%d %H:%M',
|
||||
'%Y/%m/%d',
|
||||
'%m月%d日 %H:%M',
|
||||
'%m月%d日',
|
||||
]:
|
||||
try:
|
||||
parsed_date = datetime.strptime(date_text, fmt)
|
||||
# 如果没有年份,使用当前年份
|
||||
if '%Y' not in fmt:
|
||||
parsed_date = parsed_date.replace(year=datetime.now().year)
|
||||
if not timezone.is_aware(parsed_date):
|
||||
parsed_date = timezone.make_aware(parsed_date)
|
||||
return parsed_date
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# 如果以上格式都不匹配,尝试使用 dateutil 解析
|
||||
try:
|
||||
from dateutil import parser
|
||||
if len(date_text) > 5 and not date_text.isdigit():
|
||||
parsed_date = parser.parse(date_text)
|
||||
if not timezone.is_aware(parsed_date):
|
||||
parsed_date = timezone.make_aware(parsed_date)
|
||||
return parsed_date
|
||||
except Exception:
pass
|
||||
|
||||
except Exception as e:
|
||||
self.log('debug', f'解析日期失败: {date_text}, 错误: {str(e)}')
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
def extract_author(self, soup):
|
||||
"""提取作者信息"""
|
||||
author_selectors = [
|
||||
'.author',
|
||||
'.writer',
|
||||
'.publisher',
|
||||
'.byline',
|
||||
'.article-author',
|
||||
'.news-author',
|
||||
'.source'
|
||||
]
|
||||
|
||||
for selector in author_selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text().strip()
|
||||
|
||||
return ""
|
||||
|
||||
def download_media_file(self, media_url, article, media_type='image', alt_text=''):
|
||||
"""下载媒体文件 - 适配现有模型结构"""
|
||||
try:
|
||||
# 检查URL是否有效
|
||||
if not media_url or not media_url.startswith(('http://', 'https://')):
|
||||
return None
|
||||
|
||||
# 请求媒体文件
|
||||
response = self.session.get(
|
||||
media_url,
|
||||
timeout=self.timeout,
|
||||
verify=False,
|
||||
stream=False
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# 获取文件信息
|
||||
content_type = response.headers.get('content-type', '')
|
||||
file_size = len(response.content)
|
||||
|
||||
# 确定文件扩展名
|
||||
file_extension = self.get_file_extension_from_url(media_url, content_type)
|
||||
|
||||
# 生成文件名
|
||||
existing_media_count = len(article.media_files) if article.media_files else 0
|
||||
filename = f"media_{article.id}_{existing_media_count}{file_extension}"
|
||||
|
||||
# 创建媒体文件信息字典
|
||||
media_info = {
|
||||
'type': media_type,
|
||||
'original_url': media_url,
|
||||
'filename': filename,
|
||||
'file_size': file_size,
|
||||
'mime_type': content_type,
|
||||
'alt_text': alt_text,
|
||||
'downloaded_at': timezone.now().isoformat()
|
||||
}
|
||||
|
||||
# 更新文章的媒体文件列表
|
||||
if not article.media_files:
|
||||
article.media_files = [media_info]
|
||||
else:
|
||||
article.media_files.append(media_info)
|
||||
|
||||
# 保存文件到本地(这里简化处理,实际项目中可能需要更复杂的文件存储)
|
||||
self.log('info', f'媒体文件已记录: {filename} ({media_type})')
|
||||
return media_info
|
||||
|
||||
except Exception as e:
|
||||
self.log('error', f'下载媒体文件失败 {media_url}: {str(e)}')
|
||||
return None
|
||||
|
||||
def get_file_extension_from_url(self, url, content_type):
|
||||
"""从URL或内容类型获取文件扩展名"""
|
||||
# 从URL获取扩展名
|
||||
parsed_url = urlparse(url)
|
||||
path = parsed_url.path
|
||||
if '.' in path:
|
||||
return os.path.splitext(path)[1]
|
||||
|
||||
# 从内容类型获取扩展名
|
||||
content_type_map = {
|
||||
'image/jpeg': '.jpg',
|
||||
'image/jpg': '.jpg',
|
||||
'image/png': '.png',
|
||||
'image/gif': '.gif',
|
||||
'image/webp': '.webp',
|
||||
'image/svg+xml': '.svg',
|
||||
'video/mp4': '.mp4',
|
||||
'video/avi': '.avi',
|
||||
'video/mov': '.mov',
|
||||
'video/wmv': '.wmv',
|
||||
'video/flv': '.flv',
|
||||
'video/webm': '.webm',
|
||||
'audio/mp3': '.mp3',
|
||||
'audio/wav': '.wav',
|
||||
'audio/ogg': '.ogg',
|
||||
'application/pdf': '.pdf',
|
||||
'application/msword': '.doc',
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
|
||||
}
|
||||
|
||||
return content_type_map.get(content_type.lower(), '.bin')
|
||||
|
||||
def extract_and_download_media(self, soup, article, base_url):
|
||||
"""提取并下载页面中的媒体文件"""
|
||||
media_files = []
|
||||
|
||||
# 提取图片
|
||||
images = soup.find_all('img')
|
||||
self.log('info', f'找到 {len(images)} 个图片标签')
|
||||
|
||||
for img in images:
|
||||
src = img.get('src')
|
||||
if src:
|
||||
# 处理相对URL
|
||||
if src.startswith('//'):
|
||||
src = 'https:' + src
|
||||
elif src.startswith('/'):
|
||||
src = urljoin(base_url, src)
|
||||
elif not src.startswith(('http://', 'https://')):
|
||||
src = urljoin(base_url, src)
|
||||
|
||||
alt_text = img.get('alt', '')
|
||||
media_file = self.download_media_file(src, article, 'image', alt_text)
|
||||
if media_file:
|
||||
media_files.append(media_file)
|
||||
|
||||
# 提取视频
|
||||
videos = soup.find_all(['video', 'source'])
|
||||
for video in videos:
|
||||
src = video.get('src')
|
||||
if src:
|
||||
# 处理相对URL
|
||||
if src.startswith('//'):
|
||||
src = 'https:' + src
|
||||
elif src.startswith('/'):
|
||||
src = urljoin(base_url, src)
|
||||
elif not src.startswith(('http://', 'https://')):
|
||||
src = urljoin(base_url, src)
|
||||
|
||||
media_file = self.download_media_file(src, article, 'video')
|
||||
if media_file:
|
||||
media_files.append(media_file)
|
||||
|
||||
return media_files
|
||||
|
||||
def crawl_website(self, website):
|
||||
"""爬取单个网站"""
|
||||
self.log('info', f'开始爬取网站: {website.name}')
|
||||
|
||||
try:
|
||||
# 请求主页
|
||||
response = self.session.get(
|
||||
website.base_url,
|
||||
timeout=self.timeout,
|
||||
verify=False
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# 检查内容编码
|
||||
if response.encoding != 'utf-8':
|
||||
content_type = response.headers.get('content-type', '')
|
||||
if 'charset=' in content_type:
|
||||
charset = content_type.split('charset=')[-1]
|
||||
response.encoding = charset
|
||||
else:
|
||||
response.encoding = 'utf-8'
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# 查找文章链接
|
||||
article_links = self.find_article_links(soup, website.base_url)
|
||||
self.log('info', f'找到 {len(article_links)} 个文章链接')
|
||||
|
||||
crawled_count = 0
|
||||
for link_info in article_links:
|
||||
# 检查任务是否已被取消
|
||||
if self.is_cancelled():
|
||||
self.log('info', '任务已被取消,停止处理文章')
|
||||
return crawled_count
|
||||
|
||||
try:
|
||||
# 清理和验证URL
|
||||
clean_url = self.clean_url(link_info['url'])
|
||||
|
||||
# 检查URL是否仍然有效
|
||||
if not self.is_valid_article_url(clean_url):
|
||||
self.log('warning', f'跳过无效URL: {clean_url}')
|
||||
continue
|
||||
|
||||
self.log('info', f'正在处理文章: {clean_url}')
|
||||
|
||||
# 请求文章页面
|
||||
article_response = self.session.get(
|
||||
clean_url,
|
||||
timeout=self.timeout,
|
||||
verify=False
|
||||
)
|
||||
article_response.raise_for_status()
|
||||
|
||||
# 检查内容编码
|
||||
if article_response.encoding != 'utf-8':
|
||||
content_type = article_response.headers.get('content-type', '')
|
||||
if 'charset=' in content_type:
|
||||
charset = content_type.split('charset=')[-1]
|
||||
article_response.encoding = charset
|
||||
else:
|
||||
article_response.encoding = 'utf-8'
|
||||
|
||||
article_soup = BeautifulSoup(article_response.content, 'html.parser')
|
||||
|
||||
# 提取内容
|
||||
content = self.extract_article_content(clean_url, article_soup)
|
||||
title = link_info['title']
|
||||
|
||||
# 检查关键字匹配
|
||||
matched_keywords = self.check_keyword_match(content, title)
|
||||
|
||||
if matched_keywords:
|
||||
# 提取其他信息
|
||||
publish_date = self.extract_publish_date(article_soup)
|
||||
author = self.extract_author(article_soup)
|
||||
|
||||
# 检查是否已存在相同URL的文章
|
||||
existing_article = Article.objects.filter(
|
||||
url=clean_url
|
||||
).first()
|
||||
|
||||
if existing_article:
|
||||
# 如果已存在,更新现有记录
|
||||
existing_article.title = title
|
||||
existing_article.content = content
|
||||
existing_article.pub_date = publish_date
|
||||
existing_article.media_files = [] # 重置媒体文件列表
|
||||
existing_article.save()
|
||||
|
||||
# 更新媒体文件
|
||||
media_files = self.extract_and_download_media(article_soup, existing_article, clean_url)
|
||||
|
||||
self.log('info', f'更新已存在的文章: {title[:50]}...')
|
||||
else:
|
||||
# 保存新内容
|
||||
article = Article.objects.create(
|
||||
website=website,
|
||||
title=title,
|
||||
content=content,
|
||||
url=clean_url,
|
||||
pub_date=publish_date,
|
||||
media_files=[]
|
||||
)
|
||||
|
||||
# 提取并下载媒体文件
|
||||
media_files = self.extract_and_download_media(article_soup, article, clean_url)
|
||||
|
||||
self.log('info', f'保存新文章: {title[:50]}...')
|
||||
|
||||
crawled_count += 1
|
||||
|
||||
# 请求间隔
|
||||
time.sleep(1)
|
||||
|
||||
except requests.exceptions.HTTPError as e:
|
||||
if e.response.status_code == 404:
|
||||
self.log('warning', f'文章不存在 (404): {clean_url}')
|
||||
elif e.response.status_code == 403:
|
||||
self.log('warning', f'访问被拒绝 (403): {clean_url}')
|
||||
elif e.response.status_code == 429:
|
||||
self.log('warning', f'请求过于频繁 (429): {clean_url}')
|
||||
time.sleep(5) # 等待更长时间
|
||||
else:
|
||||
self.log('error', f'HTTP错误 {e.response.status_code}: {clean_url}')
|
||||
continue
|
||||
except requests.exceptions.Timeout as e:
|
||||
self.log('warning', f'请求超时: {clean_url}')
|
||||
continue
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
self.log('warning', f'连接错误: {clean_url}')
|
||||
continue
|
||||
except Exception as e:
|
||||
self.log('error', f'处理文章失败 {clean_url}: {str(e)}')
|
||||
continue
|
||||
|
||||
self.log('info', f'网站爬取完成,共保存 {crawled_count} 篇文章')
|
||||
return crawled_count
|
||||
|
||||
except Exception as e:
|
||||
self.log('error', f'爬取网站失败: {str(e)}')
|
||||
return 0
|
||||
|
||||
def run(self):
|
||||
"""运行爬取任务"""
|
||||
self.log('info', f'开始执行关键词爬取任务: {self.task.name}')
|
||||
self.update_task_status('running')
|
||||
|
||||
total_crawled = 0
|
||||
websites = self.task.websites.all()
|
||||
self.task.total_pages = websites.count()
|
||||
self.task.save()
|
||||
|
||||
for website in websites:
|
||||
# 检查任务是否已被取消
|
||||
if self.is_cancelled():
|
||||
self.log('info', '任务已被取消,停止爬取')
|
||||
self.update_task_status('cancelled', error_message='任务被取消')
|
||||
return
|
||||
|
||||
try:
|
||||
crawled_count = self.crawl_website(website)
|
||||
total_crawled += crawled_count
|
||||
self.task.crawled_pages += 1
|
||||
self.task.save()
|
||||
|
||||
# 再次检查任务是否已被取消
|
||||
if self.is_cancelled():
|
||||
self.log('info', '任务已被取消,停止爬取')
|
||||
self.update_task_status('cancelled', error_message='任务被取消')
|
||||
return
|
||||
|
||||
except Exception as e:
|
||||
self.log('error', f'爬取网站 {website.name} 时发生错误: {str(e)}')
|
||||
continue
|
||||
|
||||
# 更新任务状态
|
||||
if total_crawled > 0:
|
||||
self.update_task_status('completed')
|
||||
self.log('info', f'关键词爬取任务完成,共爬取 {total_crawled} 篇文章')
|
||||
else:
|
||||
self.update_task_status('failed', error_message='没有找到匹配的内容')
|
||||
self.log('error', '关键词爬取任务失败,没有找到匹配的内容')
|
||||
|
||||
|
||||
def run_keyword_crawl_task(task_id, task_executor_instance=None):
|
||||
"""运行关键词爬取任务"""
|
||||
try:
|
||||
crawler = KeywordCrawler(task_id, task_executor_instance)
|
||||
crawler.run()
|
||||
return f"关键词爬取任务 {task_id} 执行完成"
|
||||
except Exception as e:
|
||||
# 记录异常到日志
|
||||
logger.error(f"执行关键词爬取任务 {task_id} 时发生异常: {str(e)}", exc_info=True)
|
||||
|
||||
task = CrawlTask.objects.get(id=task_id)
|
||||
task.status = 'failed'
|
||||
task.error_message = str(e)
|
||||
task.completed_at = timezone.now()
|
||||
task.save()
|
||||
|
||||
return f"关键词爬取任务 {task_id} 执行失败: {str(e)}"
|
||||
77
core/management/commands/crawl_all_media.py
Normal file
@@ -0,0 +1,77 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.core.management import call_command
|
||||
from core.models import Website
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "批量爬取所有中央主流媒体"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--media', type=str, help='指定要爬取的媒体,用逗号分隔')
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
help='指定平台类型: all(全部), web(网站)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
media_list = options['media']
|
||||
platform = options['platform']
|
||||
|
||||
# 所有中央主流媒体配置
|
||||
all_media = {
|
||||
'rmrb': 'crawl_rmrb',
|
||||
'xinhua': 'crawl_xinhua',
|
||||
'cctv': 'crawl_cctv',
|
||||
'qiushi': 'crawl_qiushi',
|
||||
'pla': 'crawl_pla',
|
||||
'gmrb': 'crawl_gmrb',
|
||||
'jjrb': 'crawl_jjrb',
|
||||
'chinadaily': 'crawl_chinadaily',
|
||||
'grrb': 'crawl_grrb',
|
||||
'kjrb': 'crawl_kjrb',
|
||||
'rmzxb': 'crawl_rmzxb',
|
||||
'zgjwjc': 'crawl_zgjwjc',
|
||||
'chinanews': 'crawl_chinanews',
|
||||
'xxsb': 'crawl_xxsb',
|
||||
'zgqnb': 'crawl_zgqnb',
|
||||
'zgfnb': 'crawl_zgfnb',
|
||||
'fzrb': 'crawl_fzrb',
|
||||
'nmrb': 'crawl_nmrb',
|
||||
'xuexi': 'crawl_xuexi',
|
||||
'qizhi': 'crawl_qizhi',
|
||||
'china': 'crawl_china'
|
||||
}
|
||||
|
||||
# 如果指定了特定媒体,则只爬取指定的媒体
|
||||
if media_list:
|
||||
target_media = [media.strip() for media in media_list.split(',')]
|
||||
else:
|
||||
target_media = list(all_media.keys())
|
||||
|
||||
self.stdout.write(f"开始批量爬取 {len(target_media)} 家中央主流媒体...")
|
||||
|
||||
for media in target_media:
|
||||
if media in all_media:
|
||||
command_name = all_media[media]
|
||||
try:
|
||||
self.stdout.write(f"正在爬取: {media}")
|
||||
call_command(command_name, platform=platform)
|
||||
self.stdout.write(self.style.SUCCESS(f"完成爬取: {media}"))
|
||||
except Exception as e:
|
||||
self.stdout.write(self.style.ERROR(f"爬取 {media} 失败: {e}"))
|
||||
else:
|
||||
self.stdout.write(self.style.WARNING(f"未知媒体: {media}"))
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("所有中央主流媒体爬取完成"))
|
||||
|
||||
# 显示统计信息
|
||||
total_websites = Website.objects.count()
|
||||
total_articles = sum([website.article_set.count() for website in Website.objects.all()])
|
||||
|
||||
self.stdout.write(f"统计信息:")
|
||||
self.stdout.write(f"- 总网站数: {total_websites}")
|
||||
self.stdout.write(f"- 总文章数: {total_articles}")
|
||||
|
||||
# 显示各媒体文章数量
|
||||
self.stdout.write(f"各媒体文章数量:")
|
||||
for website in Website.objects.all():
|
||||
article_count = website.article_set.count()
|
||||
self.stdout.write(f"- {website.name}: {article_count} 篇")
|
||||
266
core/management/commands/crawl_all_websites.py
Normal file
@@ -0,0 +1,266 @@
|
||||
import json
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.utils import full_site_crawler, crawl_by_keyword, WEBSITE_SEARCH_CONFIGS
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = '一键爬取所有网站'
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument(
|
||||
'--mode', '-m',
|
||||
type=str,
|
||||
default='both',
|
||||
choices=['full', 'keyword', 'both'],
|
||||
help='爬取模式: full(全站爬取), keyword(关键词搜索), both(两者都执行)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--keyword', '-k',
|
||||
type=str,
|
||||
help='关键词搜索的关键词'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--websites', '-w',
|
||||
type=str,
|
||||
nargs='*',
|
||||
help='指定要爬取的网站列表'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--max-pages', '-p',
|
||||
type=int,
|
||||
default=500,
|
||||
help='全站爬取的最大页数'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--max-search-pages', '-P',
|
||||
type=int,
|
||||
default=10,
|
||||
help='关键词搜索的最大页数'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--max-articles', '-a',
|
||||
type=int,
|
||||
default=100,
|
||||
help='关键词搜索的最大文章数'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--start-date', '-s',
|
||||
type=str,
|
||||
help='开始日期 (YYYY-MM-DD)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--end-date', '-e',
|
||||
type=str,
|
||||
help='结束日期 (YYYY-MM-DD)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--output', '-o',
|
||||
type=str,
|
||||
help='将结果保存到JSON文件'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--skip-existing', '-S',
|
||||
action='store_true',
|
||||
help='跳过已存在的网站'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--list-websites', '-l',
|
||||
action='store_true',
|
||||
help='列出所有支持的网站'
|
||||
)
|
||||
|
||||
def handle(self, *args, **options):
|
||||
# 列出支持的网站
|
||||
if options['list_websites']:
|
||||
self.stdout.write(self.style.SUCCESS("支持的网站列表:"))
|
||||
for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
|
||||
self.stdout.write(f"{i:2d}. {website}")
|
||||
return
|
||||
|
||||
mode = options['mode']
|
||||
keyword = options['keyword']
|
||||
websites = options['websites']
|
||||
max_pages = options['max_pages']
|
||||
max_search_pages = options['max_search_pages']
|
||||
max_articles = options['max_articles']
|
||||
start_date = options['start_date']
|
||||
end_date = options['end_date']
|
||||
output_file = options['output']
|
||||
skip_existing = options['skip_existing']
|
||||
|
||||
# 验证网站名称
|
||||
if websites:
|
||||
            # 确保websites是列表类型
            if isinstance(websites, str):
                websites = [websites]
            invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
            if invalid_websites:
                # 确保invalid_websites是可迭代的
                if isinstance(invalid_websites, str):
                    invalid_websites = [invalid_websites]
                self.stdout.write(
                    self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}")
                )
                self.stdout.write("使用 --list-websites 查看支持的网站列表")
                return

        # 确定要爬取的网站列表
        target_websites = websites if websites else list(WEBSITE_SEARCH_CONFIGS.keys())

        # 验证关键词模式
        if mode in ['keyword', 'both'] and not keyword:
            self.stdout.write(
                self.style.ERROR("关键词模式需要指定 --keyword 参数")
            )
            return

        self.stdout.write(f"开始一键爬取任务...")
        self.stdout.write(f"爬取模式: {mode}")
        # 确保target_websites是可迭代的
        if isinstance(target_websites, str):
            target_websites = [target_websites]
        self.stdout.write(f"目标网站: {', '.join(target_websites)}")
        if keyword:
            self.stdout.write(f"关键词: {keyword}")
        if start_date:
            self.stdout.write(f"开始日期: {start_date}")
        if end_date:
            self.stdout.write(f"结束日期: {end_date}")

        all_results = {
            "mode": mode,
            "websites": target_websites,
            "keyword": keyword,
            "start_date": start_date,
            "end_date": end_date,
            "full_crawl_results": {},
            "keyword_crawl_results": {},
            "summary": {
                "total_websites": len(target_websites),
                "full_crawl_success": 0,
                "full_crawl_failed": 0,
                "keyword_crawl_success": 0,
                "keyword_crawl_failed": 0
            }
        }

        try:
            for website_name in target_websites:
                self.stdout.write(f"\n{'='*50}")
                self.stdout.write(f"开始处理网站: {website_name}")
                self.stdout.write(f"{'='*50}")

                # 获取或创建网站对象
                from core.models import Website
                website, created = Website.objects.get_or_create(
                    name=website_name,
                    defaults={
                        'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                        'enabled': True
                    }
                )

                if not created and skip_existing:
                    self.stdout.write(f"跳过已存在的网站: {website_name}")
                    continue

                website_results = {
                    "full_crawl": None,
                    "keyword_crawl": None
                }

                # 全站爬取
                if mode in ['full', 'both']:
                    self.stdout.write(f"\n开始全站爬取: {website_name}")
                    try:
                        full_site_crawler(
                            WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                            website,
                            max_pages=max_pages
                        )
                        self.stdout.write(self.style.SUCCESS(f"全站爬取完成: {website_name}"))
                        website_results["full_crawl"] = {"status": "success"}
                        all_results["summary"]["full_crawl_success"] += 1
                    except Exception as e:
                        self.stdout.write(self.style.ERROR(f"全站爬取失败: {website_name}, 错误: {e}"))
                        website_results["full_crawl"] = {"status": "failed", "error": str(e)}
                        all_results["summary"]["full_crawl_failed"] += 1

                # 关键词爬取
                if mode in ['keyword', 'both']:
                    self.stdout.write(f"\n开始关键词爬取: {website_name}")
                    try:
                        keyword_results = crawl_by_keyword(
                            keyword=keyword,
                            website_names=[website_name],
                            max_pages=max_search_pages,
                            start_date=start_date,
                            end_date=end_date,
                            max_articles=max_articles
                        )
                        website_results["keyword_crawl"] = keyword_results
                        if keyword_results["success_count"] > 0:
                            all_results["summary"]["keyword_crawl_success"] += 1
                        else:
                            all_results["summary"]["keyword_crawl_failed"] += 1
                    except Exception as e:
                        self.stdout.write(self.style.ERROR(f"关键词爬取失败: {website_name}, 错误: {e}"))
                        website_results["keyword_crawl"] = {"status": "failed", "error": str(e)}
                        all_results["summary"]["keyword_crawl_failed"] += 1

                all_results["full_crawl_results"][website_name] = website_results["full_crawl"]
                all_results["keyword_crawl_results"][website_name] = website_results["keyword_crawl"]

            # 显示最终结果摘要
            self.stdout.write(f"\n{'='*50}")
            self.stdout.write(self.style.SUCCESS("一键爬取完成!"))
            self.stdout.write(f"{'='*50}")
            self.stdout.write(f"总网站数: {all_results['summary']['total_websites']}")

            if mode in ['full', 'both']:
                self.stdout.write(f"全站爬取 - 成功: {all_results['summary']['full_crawl_success']}, "
                                  f"失败: {all_results['summary']['full_crawl_failed']}")

            if mode in ['keyword', 'both']:
                self.stdout.write(f"关键词爬取 - 成功: {all_results['summary']['keyword_crawl_success']}, "
                                  f"失败: {all_results['summary']['keyword_crawl_failed']}")

            # 显示各网站详细结果
            self.stdout.write("\n各网站详细结果:")
            for website_name in target_websites:
                self.stdout.write(f"\n{website_name}:")

                if mode in ['full', 'both']:
                    full_result = all_results["full_crawl_results"][website_name]
                    if full_result and full_result.get("status") == "success":
                        self.stdout.write(self.style.SUCCESS(" 全站爬取: 成功"))
                    elif full_result:
                        self.stdout.write(self.style.ERROR(f" 全站爬取: 失败 - {full_result.get('error', '未知错误')}"))

                if mode in ['keyword', 'both']:
                    keyword_result = all_results["keyword_crawl_results"][website_name]
                    if keyword_result and "success_count" in keyword_result:
                        self.stdout.write(f" 关键词爬取: 成功 {keyword_result['success_count']} 篇, "
                                          f"失败 {keyword_result['failed_count']} 篇")
                    elif keyword_result and keyword_result.get("status") == "failed":
                        self.stdout.write(self.style.ERROR(f" 关键词爬取: 失败 - {keyword_result.get('error', '未知错误')}"))

            # 保存结果到文件
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(all_results, f, ensure_ascii=False, indent=2)
                self.stdout.write(f"\n结果已保存到: {output_file}")

        except Exception as e:
            self.stdout.write(self.style.ERROR(f"一键爬取过程中出现错误: {e}"))
            raise
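Note: when --output is given, the command above dumps the whole all_results structure to a JSON file. A minimal sketch of reading that report back afterwards (the file name results.json is only an example, not part of this change):

import json

with open("results.json", encoding="utf-8") as f:
    all_results = json.load(f)

summary = all_results["summary"]
print(f"full crawl ok/failed: {summary['full_crawl_success']}/{summary['full_crawl_failed']}")
print(f"keyword crawl ok/failed: {summary['keyword_crawl_success']}/{summary['keyword_crawl_failed']}")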
168
core/management/commands/crawl_by_keyword.py
Normal file
@@ -0,0 +1,168 @@
import json
from django.core.management.base import BaseCommand
from core.utils import crawl_by_keyword, WEBSITE_SEARCH_CONFIGS


class Command(BaseCommand):
    help = '根据关键词爬取文章'

    def add_arguments(self, parser):
        parser.add_argument(
            '--keyword', '-k',
            type=str,
            required=True,
            help='搜索关键词'
        )

        parser.add_argument(
            '--websites', '-w',
            type=str,
            nargs='*',
            help='指定要爬取的网站列表'
        )

        parser.add_argument(
            '--max-pages', '-p',
            type=int,
            default=10,
            help='每个网站最大搜索页数'
        )

        parser.add_argument(
            '--max-articles', '-m',
            type=int,
            default=100,
            help='最大文章数量'
        )

        parser.add_argument(
            '--start-date', '-s',
            type=str,
            help='开始日期 (YYYY-MM-DD)'
        )

        parser.add_argument(
            '--end-date', '-e',
            type=str,
            help='结束日期 (YYYY-MM-DD)'
        )

        parser.add_argument(
            '--historical', '-H',
            action='store_true',
            help='使用历史文章爬取模式'
        )

        parser.add_argument(
            '--list-websites', '-l',
            action='store_true',
            help='列出所有支持的网站'
        )

        parser.add_argument(
            '--output', '-o',
            type=str,
            help='将结果保存到JSON文件'
        )

    def handle(self, *args, **options):
        # 列出支持的网站
        if options['list_websites']:
            self.stdout.write(self.style.SUCCESS("支持的网站列表:"))
            for i, website in enumerate(WEBSITE_SEARCH_CONFIGS.keys(), 1):
                self.stdout.write(f"{i:2d}. {website}")
            return

        keyword = options['keyword']
        if not keyword:
            self.stdout.write(self.style.ERROR("必须指定 --keyword 参数"))
            return
        websites = options['websites']
        max_pages = options['max_pages']
        max_articles = options['max_articles']
        start_date = options['start_date']
        end_date = options['end_date']
        historical = options['historical']
        output_file = options['output']

        # 验证网站名称
        if websites:
            # 确保websites是列表类型
            if isinstance(websites, str):
                websites = [websites]
            invalid_websites = [w for w in websites if w not in WEBSITE_SEARCH_CONFIGS]
            if invalid_websites:
                # 确保invalid_websites是可迭代的
                if isinstance(invalid_websites, str):
                    invalid_websites = [invalid_websites]
                self.stdout.write(
                    self.style.ERROR(f"不支持的网站: {', '.join(invalid_websites)}")
                )
                self.stdout.write("使用 --list-websites 查看支持的网站列表")
                return

        self.stdout.write(f"开始爬取任务...")
        self.stdout.write(f"关键词: {keyword}")
        # 确保websites是可迭代的
        if websites:
            if isinstance(websites, str):
                websites = [websites]
            self.stdout.write(f"目标网站: {', '.join(websites)}")
        else:
            self.stdout.write(f"目标网站: 所有支持的网站 ({len(WEBSITE_SEARCH_CONFIGS)}个)")

        if start_date:
            self.stdout.write(f"开始日期: {start_date}")
        if end_date:
            self.stdout.write(f"结束日期: {end_date}")
        self.stdout.write(f"最大页数: {max_pages}")
        self.stdout.write(f"最大文章数: {max_articles}")

        try:
            if historical:
                # 历史文章爬取模式
                self.stdout.write(self.style.WARNING("使用历史文章爬取模式"))
                from core.utils import crawl_historical_articles
                results = crawl_historical_articles(
                    website_names=websites,
                    start_date=start_date,
                    end_date=end_date,
                    max_articles_per_site=max_articles
                )
            else:
                # 关键词搜索模式
                results = crawl_by_keyword(
                    keyword=keyword,
                    website_names=websites,
                    max_pages=max_pages,
                    start_date=start_date,
                    end_date=end_date,
                    max_articles=max_articles
                )

            # 显示结果摘要
            self.stdout.write(self.style.SUCCESS("\n爬取完成!"))
            self.stdout.write(f"总文章数: {results['total_articles']}")
            self.stdout.write(f"成功: {results['success_count']}")
            self.stdout.write(f"失败: {results['failed_count']}")

            # 显示各网站详细结果
            self.stdout.write("\n各网站结果:")
            for website, result in results['website_results'].items():
                status = self.style.SUCCESS if result['success'] > 0 else self.style.WARNING
                self.stdout.write(
                    status(f" {website}: 找到 {result['found_urls']} 篇, "
                           f"成功 {result['success']}, 失败 {result['failed']}")
                )
                if 'error' in result:
                    self.stdout.write(self.style.ERROR(f" 错误: {result['error']}"))

            # 保存结果到文件
            if output_file:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
                self.stdout.write(f"\n结果已保存到: {output_file}")

        except Exception as e:
            self.stdout.write(self.style.ERROR(f"爬取过程中出现错误: {e}"))
            raise
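Usage note: besides `python manage.py crawl_by_keyword ...`, the new command can also be driven programmatically through Django's call_command (for example from a Celery task). The option values below are placeholders, not part of this change:

from django.core.management import call_command

# Equivalent to: python manage.py crawl_by_keyword -k <keyword> -p 5 -o results.json
call_command(
    "crawl_by_keyword",
    keyword="placeholder keyword",  # hypothetical value
    max_pages=5,
    output="results.json",
)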
61
core/management/commands/crawl_cctv.py
Normal file
@@ -0,0 +1,61 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler


# jimmy.fang:20250815: 因 CCTV 的视频有做加密动作,无法下载,移除支持
class Command(BaseCommand):
    help = "全站递归爬取 中央广播电视总台及其子网站、客户端、新媒体平台"

    def add_arguments(self, parser):
        parser.add_argument('--platform', type=str, default='all',
                            choices=['cctvnews', 'all'],
                            help='选择爬取平台: cctvnews(央视新闻), all(全部)')

    def handle(self, *args, **options):
        platform = options['platform']

        # 中央广播电视总台各平台配置
        platforms = {
            # jimmy.fang:20250815: 因 CCTV 的视频有做加密动作,无法下载,移除支持
            # 'cctv': {
            #     'name': '央视网',
            #     'base_url': 'https://www.cctv.com',
            #     'start_url': 'https://www.cctv.com',
            #     'article_selector': 'a'
            # },
            'cctvnews': {
                'name': '央视新闻',
                'base_url': 'https://news.cctv.com',
                'start_url': 'https://news.cctv.com',
                'article_selector': 'a'
            }
        }

        if platform == 'all':
            target_platforms = platforms.values()
        else:
            target_platforms = [platforms[platform]]

        for platform_config in target_platforms:
            website, created = Website.objects.get_or_create(
                name=platform_config['name'],
                defaults={
                    'base_url': platform_config['base_url'],
                    'article_list_url': platform_config['start_url'],
                    'article_selector': platform_config['article_selector']
                }
            )

            # 确保更新已存在的网站对象的配置
            if not created:
                website.base_url = platform_config['base_url']
                website.article_list_url = platform_config['start_url']
                website.article_selector = platform_config['article_selector']
                website.save()

            self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
            full_site_crawler(platform_config['start_url'], website, max_pages=500)
            self.stdout.write(f"完成爬取: {platform_config['name']}")

        self.stdout.write(self.style.SUCCESS("中央广播电视总台所有平台爬取完成"))
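Observation: crawl_cctv.py and the other crawl_* commands added below all repeat the same Website get_or_create-then-update block. A possible shared helper is sketched here purely for illustration; it is not part of this diff and the name sync_website is hypothetical:

from core.models import Website

def sync_website(platform_config):
    """Create the Website row if missing, otherwise refresh its crawl settings."""
    website, created = Website.objects.get_or_create(
        name=platform_config['name'],
        defaults={
            'base_url': platform_config['base_url'],
            'article_list_url': platform_config['start_url'],
            'article_selector': platform_config['article_selector'],
        },
    )
    if not created:
        # Keep an existing row in sync with the latest platform configuration.
        website.base_url = platform_config['base_url']
        website.article_list_url = platform_config['start_url']
        website.article_selector = platform_config['article_selector']
        website.save()
    return website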
54
core/management/commands/crawl_china.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 中国网主网"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['china', 'all'],
|
||||
help='选择爬取平台: china(中国网主网), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 中国网各平台配置
|
||||
platforms = {
|
||||
'china': {
|
||||
'name': '中国网',
|
||||
'base_url': 'http://www.china.com.cn',
|
||||
'start_url': 'http://www.china.com.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("中国网所有平台爬取完成"))
|
||||
54
core/management/commands/crawl_chinadaily.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 中国日报平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['chinadaily','all'],
|
||||
help='选择爬取平台: chinadaily(中国日报), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 中国日报各平台配置
|
||||
platforms = {
|
||||
'chinadaily': {
|
||||
'name': '中国日报',
|
||||
'base_url': 'https://www.chinadaily.com.cn',
|
||||
'start_url': 'https://www.chinadaily.com.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("中国日报所有平台爬取完成"))
|
||||
53
core/management/commands/crawl_chinanews.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 中国新闻社平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['chinanews', 'all'],
|
||||
help='选择爬取平台: chinanews(中国新闻社), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 中国新闻社各平台配置
|
||||
platforms = {
|
||||
'chinanews': {
|
||||
'name': '中国新闻社',
|
||||
'base_url': 'https://www.chinanews.com.cn',
|
||||
'start_url': 'https://www.chinanews.com.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("中国新闻社所有平台爬取完成"))
|
||||
@@ -4,17 +4,50 @@ from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 www.gov.cn"
|
||||
help = "全站递归爬取 中国政府网及其子网站"
|
||||
|
||||
def handle(self, *args, **kwargs):
|
||||
website, created = Website.objects.get_or_create(
|
||||
name="www.gov.cn",
|
||||
defaults={
|
||||
'article_list_url': 'https://www.gov.cn/',
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['govcn', 'all'],
|
||||
help='选择爬取平台: govcn(中国政府网), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 中国政府网各平台配置
|
||||
platforms = {
|
||||
'govcn': {
|
||||
'name': '中国政府网',
|
||||
'base_url': 'https://www.gov.cn/',
|
||||
'start_url': 'https://www.gov.cn/',
|
||||
'article_selector': 'a'
|
||||
}
|
||||
)
|
||||
start_url = "https://www.gov.cn/"
|
||||
self.stdout.write(f"开始全站爬取: {start_url}")
|
||||
full_site_crawler(start_url, website, max_pages=500)
|
||||
self.stdout.write("爬取完成")
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("中国政府网所有平台爬取完成"))
|
||||
|
||||
@@ -6,15 +6,48 @@ from core.utils import full_site_crawler
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 东方烟草报"
|
||||
|
||||
def handle(self, *args, **kwargs):
|
||||
website, created = Website.objects.get_or_create(
|
||||
name="东方烟草报",
|
||||
defaults={
|
||||
'article_list_url': 'https://www.eastobacco.com/',
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['eastobacco', 'all'],
|
||||
help='选择爬取平台: eastobacco(东方烟草报), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 东方烟草报各平台配置
|
||||
platforms = {
|
||||
'eastobacco': {
|
||||
'name': '东方烟草报',
|
||||
'base_url': 'https://www.eastobacco.com/',
|
||||
'start_url': 'https://www.eastobacco.com/',
|
||||
'article_selector': 'a'
|
||||
}
|
||||
)
|
||||
start_url = "https://www.eastobacco.com/"
|
||||
self.stdout.write(f"开始全站爬取: {start_url}")
|
||||
full_site_crawler(start_url, website, max_pages=500)
|
||||
self.stdout.write("爬取完成")
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("东方烟草报所有平台爬取完成"))
|
||||
|
||||
53
core/management/commands/crawl_fzrb.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 法治日报平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['fzrb', 'all'],
|
||||
help='选择爬取平台: fzrb(法治日报), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 法治日报各平台配置
|
||||
platforms = {
|
||||
'fzrb': {
|
||||
'name': '法治日报',
|
||||
'base_url': 'http://www.legaldaily.com.cn',
|
||||
'start_url': 'http://www.legaldaily.com.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("法治日报所有平台爬取完成"))
|
||||
54
core/management/commands/crawl_gmrb.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
# jimmy.fang-20250815: 取消对光明日报的支持,光明日报反爬,被阻挡
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 光明日报平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['gmrb', 'all'],
|
||||
help='选择爬取平台: gmrb(光明日报), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 光明日报各平台配置
|
||||
platforms = {
|
||||
'gmrb': {
|
||||
'name': '光明日报',
|
||||
'base_url': 'https://www.gmw.cn',
|
||||
'start_url': 'https://www.gmw.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("光明日报所有平台爬取完成"))
|
||||
53
core/management/commands/crawl_grrb.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 工人日报平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['grrb', 'all'],
|
||||
help='选择爬取平台: grrb(工人日报), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 工人日报各平台配置
|
||||
platforms = {
|
||||
'grrb': {
|
||||
'name': '工人日报',
|
||||
'base_url': 'http://www.workercn.cn',
|
||||
'start_url': 'http://www.workercn.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("工人日报所有平台爬取完成"))
|
||||
53
core/management/commands/crawl_jjrb.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 经济日报平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['jjrb', 'all'],
|
||||
help='选择爬取平台: jjrb(经济日报), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 经济日报各平台配置
|
||||
platforms = {
|
||||
'jjrb': {
|
||||
'name': '经济日报',
|
||||
'base_url': 'http://www.ce.cn',
|
||||
'start_url': 'http://www.ce.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("经济日报所有平台爬取完成"))
|
||||
54
core/management/commands/crawl_kjrb.py
Normal file
@@ -0,0 +1,54 @@
|
||||
# jimmy.fang-20250815: 不支援
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 科技日报平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['kjrb', 'all'],
|
||||
help='选择爬取平台: kjrb(科技日报), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 科技日报各平台配置
|
||||
platforms = {
|
||||
'kjrb': {
|
||||
'name': '科技日报',
|
||||
'base_url': 'http://digitalpaper.stdaily.com',
|
||||
'start_url': 'http://digitalpaper.stdaily.com',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("科技日报所有平台爬取完成"))
|
||||
53
core/management/commands/crawl_nmrb.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 农民日报及其子网站、客户端、新媒体平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['nmrb', 'all'],
|
||||
help='选择爬取平台: nmrb(农民日报), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 农民日报各平台配置
|
||||
platforms = {
|
||||
'nmrb': {
|
||||
'name': '农民日报',
|
||||
'base_url': 'http://www.farmer.com.cn',
|
||||
'start_url': 'http://www.farmer.com.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("农民日报所有平台爬取完成"))
|
||||
53
core/management/commands/crawl_pla.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 解放军报平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['pla', 'all'],
|
||||
help='选择爬取平台: pla(解放军报), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 解放军报各平台配置
|
||||
platforms = {
|
||||
'pla': {
|
||||
'name': '解放军报',
|
||||
'base_url': 'https://www.81.cn',
|
||||
'start_url': 'https://www.81.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("解放军报所有平台爬取完成"))
|
||||
53
core/management/commands/crawl_qiushi.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 求是杂志平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['qiushi', 'all'],
|
||||
help='选择爬取平台: qiushi(求是网), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 求是杂志各平台配置
|
||||
platforms = {
|
||||
'qiushi': {
|
||||
'name': '求是网',
|
||||
'base_url': 'https://www.qstheory.cn',
|
||||
'start_url': 'https://www.qstheory.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("求是杂志所有平台爬取完成"))
|
||||
53
core/management/commands/crawl_qizhi.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 旗帜网平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['qizhi', 'all'],
|
||||
help='选择爬取平台: qizhi(旗帜网), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 旗帜网各平台配置
|
||||
platforms = {
|
||||
'qizhi': {
|
||||
'name': '旗帜网',
|
||||
'base_url': 'http://www.qizhiwang.org.cn',
|
||||
'start_url': 'http://www.qizhiwang.org.cn',
|
||||
'article_selector': 'a[href^="/"]' # 修改选择器以更好地匹配文章链接
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("旗帜网所有平台爬取完成"))
|
||||
65
core/management/commands/crawl_rmrb.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 人民日报及其子网站、客户端、新媒体平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['peopleapp', 'people', 'paper', 'all'],
|
||||
help='选择爬取平台: peopleapp(客户端), people(人民网), paper(报纸), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 人民日报各平台配置
|
||||
platforms = {
|
||||
'peopleapp': {
|
||||
'name': '人民日报客户端',
|
||||
'base_url': 'https://www.peopleapp.com',
|
||||
'start_url': 'https://www.peopleapp.com/home',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
'people': {
|
||||
'name': '人民网',
|
||||
'base_url': 'https://www.people.com.cn',
|
||||
'start_url': 'https://www.people.com.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
'paper': {
|
||||
'name': '人民日报报纸',
|
||||
'base_url': 'http://paper.people.com.cn',
|
||||
'start_url': 'http://paper.people.com.cn',
|
||||
'article_selector': 'a'
|
||||
}
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("人民日报所有平台爬取完成"))
|
||||
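Usage note: crawl_rmrb is the one new command above with several --platform choices, so a single sub-platform can be crawled on its own. A minimal invocation sketch (values taken from the choices defined above):

from django.core.management import call_command

# Crawl only 人民网; platform="all" (the default) covers every configured platform.
call_command("crawl_rmrb", platform="people")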
53
core/management/commands/crawl_rmzxb.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 人民政协网平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['rmzxb', 'all'],
|
||||
help='选择爬取平台: rmzxb(人民政协网), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 人民政协网各平台配置
|
||||
platforms = {
|
||||
'rmzxb': {
|
||||
'name': '人民政协网',
|
||||
'base_url': 'https://www.rmzxw.com.cn',
|
||||
'start_url': 'https://www.rmzxw.com.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("人民政协网所有平台爬取完成"))
|
||||
@@ -4,17 +4,51 @@ from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 www.news.cn"
|
||||
help = "全站递归爬取 新华社平台"
|
||||
|
||||
def handle(self, *args, **kwargs):
|
||||
website, created = Website.objects.get_or_create(
|
||||
name="www.news.cn",
|
||||
defaults={
|
||||
'article_list_url': 'https://www.news.cn/',
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['news', 'all'],
|
||||
help='选择爬取平台: news(新华网), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 新华社各平台配置
|
||||
platforms = {
|
||||
'news': {
|
||||
'name': '新华网',
|
||||
'base_url': 'https://www.news.cn',
|
||||
'start_url': 'https://www.news.cn',
|
||||
'article_selector': 'a'
|
||||
}
|
||||
)
|
||||
start_url = "https://www.news.cn/"
|
||||
self.stdout.write(f"开始全站爬取: {start_url}")
|
||||
full_site_crawler(start_url, website, max_pages=500)
|
||||
self.stdout.write("爬取完成")
|
||||
},
|
||||
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("新华社所有平台爬取完成"))
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import crawl_xinhua_list
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = '批量爬取新华网文章'
|
||||
|
||||
def handle(self, *args, **options):
|
||||
# 添加使用标记,确认该命令是否被调用
|
||||
self.stdout.write(self.style.WARNING("crawl_xinhua command is being used"))
|
||||
|
||||
list_url = "https://www.news.cn/legal/index.html"
|
||||
try:
|
||||
website = Website.objects.get(base_url="https://www.news.cn/")
|
||||
except Website.DoesNotExist:
|
||||
self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
|
||||
return
|
||||
|
||||
self.stdout.write(f"开始爬取文章列表页: {list_url}")
|
||||
crawl_xinhua_list(list_url, website)
|
||||
self.stdout.write(self.style.SUCCESS("批量爬取完成"))
|
||||
53
core/management/commands/crawl_xuexi.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 学习强国平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['xuexi', 'all'],
|
||||
help='选择爬取平台: xuexi(学习强国主站), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 学习强国各平台配置
|
||||
platforms = {
|
||||
'xuexi': {
|
||||
'name': '学习强国',
|
||||
'base_url': 'https://www.xuexi.cn',
|
||||
'start_url': 'https://www.xuexi.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("学习强国所有平台爬取完成"))
|
||||
53
core/management/commands/crawl_xxsb.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 学习时报平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['xxsb', 'all'],
|
||||
help='选择爬取平台: xxsb(学习时报),all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 学习时报各平台配置
|
||||
platforms = {
|
||||
'xxsb': {
|
||||
'name': '学习时报',
|
||||
'base_url': 'http://www.studytimes.cn',
|
||||
'start_url': 'http://www.studytimes.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("学习时报所有平台爬取完成"))
|
||||
54
core/management/commands/crawl_zgfnb.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 中国妇女报平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['zgfnb', 'all'],
|
||||
help='选择爬取平台: zgfnb(中国妇女报), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 中国妇女报各平台配置
|
||||
platforms = {
|
||||
'zgfnb': {
|
||||
'name': '中国妇女报',
|
||||
'base_url': 'http://www.cnwomen.com.cn',
|
||||
'start_url': 'http://www.cnwomen.com.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("中国妇女报所有平台爬取完成"))
|
||||
53
core/management/commands/crawl_zgjwjc.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 中国纪检监察报平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['zgjwjc', 'all'],
|
||||
help='选择爬取平台: zgjwjc(中国纪检监察报),all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 中国纪检监察报各平台配置
|
||||
platforms = {
|
||||
'zgjwjc': {
|
||||
'name': '中国纪检监察报',
|
||||
'base_url': 'https://jjjcb.ccdi.gov.cn',
|
||||
'start_url': 'https://jjjcb.ccdi.gov.cn',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("中国纪检监察报所有平台爬取完成"))
|
||||
54
core/management/commands/crawl_zgqnb.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "全站递归爬取 中国青年报平台"
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--platform', type=str, default='all',
|
||||
choices=['zgqnb', 'all'],
|
||||
help='选择爬取平台: zgqnb(中国青年报), all(全部)')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
platform = options['platform']
|
||||
|
||||
# 中国青年报各平台配置
|
||||
platforms = {
|
||||
'zgqnb': {
|
||||
'name': '中国青年报',
|
||||
'base_url': 'https://www.cyol.com',
|
||||
'start_url': 'https://www.cyol.com',
|
||||
'article_selector': 'a'
|
||||
},
|
||||
|
||||
}
|
||||
|
||||
if platform == 'all':
|
||||
target_platforms = platforms.values()
|
||||
else:
|
||||
target_platforms = [platforms[platform]]
|
||||
|
||||
for platform_config in target_platforms:
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=platform_config['name'],
|
||||
defaults={
|
||||
'base_url': platform_config['base_url'],
|
||||
'article_list_url': platform_config['start_url'],
|
||||
'article_selector': platform_config['article_selector']
|
||||
}
|
||||
)
|
||||
|
||||
# 确保更新已存在的网站对象的配置
|
||||
if not created:
|
||||
website.base_url = platform_config['base_url']
|
||||
website.article_list_url = platform_config['start_url']
|
||||
website.article_selector = platform_config['article_selector']
|
||||
website.save()
|
||||
|
||||
self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
|
||||
full_site_crawler(platform_config['start_url'], website, max_pages=500)
|
||||
self.stdout.write(f"完成爬取: {platform_config['name']}")
|
||||
|
||||
self.stdout.write(self.style.SUCCESS("中国青年报所有平台爬取完成"))
|
||||
@@ -4,25 +4,32 @@ import json
import csv
import os
from django.conf import settings
from django.core.files.storage import default_storage
import zipfile
from django.utils import timezone
from bs4 import BeautifulSoup
# 添加python-docx库支持
import io
from docx import Document


class Command(BaseCommand):
    help = '导出文章及相关的媒体文件(图片、视频等)'

    def add_arguments(self, parser):
        parser.add_argument('--format', type=str, default='json', help='导出格式: json 或 csv')
        parser.add_argument('--format', type=str, default='docx', help='导出格式: json、csv 或 docx')
        parser.add_argument('--website', type=str, help='指定网站名称导出特定网站的文章')
        parser.add_argument('--output', type=str, default='', help='输出文件路径')
        parser.add_argument('--include-media', action='store_true', help='包含媒体文件')
        # 修改默认值为True,使包含媒体文件成为默认行为
        parser.add_argument('--include-media', action='store_true', default=True, help='包含媒体文件')
        # 添加参数控制是否打包成zip
        parser.add_argument('--no-zip', action='store_true', help='不打包成zip文件')

    def handle(self, *args, **options):
        format_type = options['format'].lower()
        website_name = options['website']
        output_path = options['output']
        include_media = options['include_media']
        no_zip = options['no_zip']

        # 获取文章查询集
        articles = Article.objects.all()
@@ -65,20 +72,26 @@ class Command(BaseCommand):
        # 确定输出路径
        if not output_path:
            timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
            if include_media:
                output_path = f'articles_export_{timestamp}.zip'
            else:
                output_path = f'articles_export_{timestamp}.{format_type}'
            # 默认导出为zip格式
            output_path = f'articles_export_{timestamp}.zip'

        # 执行导出
        if include_media:
            self.export_with_media(articles_data, media_files, output_path, format_type)
        # 如果需要包含媒体文件或格式为docx,则默认打包成zip
        if include_media or format_type == 'docx':
            if no_zip:
                if format_type == 'docx':
                    self.export_as_word(articles_data, output_path)
                elif format_type == 'json':
                    self.export_as_json(articles_data, output_path)
                elif format_type == 'csv':
                    self.export_as_csv(articles_data, output_path)
            else:
                self.export_with_media(articles_data, media_files, output_path, format_type)
        else:
            if format_type == 'json':
                self.export_as_json(articles_data, output_path)
            elif format_type == 'csv':
                self.export_as_csv(articles_data, output_path)
            # 添加Word格式导出支持
            elif format_type == 'docx':
                self.export_as_word(articles_data, output_path)
            else:
@@ -110,202 +123,100 @@ class Command(BaseCommand):
# 添加Word格式导出方法
|
||||
def export_as_word(self, articles_data, output_path):
|
||||
try:
|
||||
from docx import Document
|
||||
from docx.shared import Inches
|
||||
except ImportError:
|
||||
self.stdout.write(self.style.ERROR('缺少python-docx库,请安装: pip install python-docx'))
|
||||
return
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading('文章导出', 0)
|
||||
|
||||
for article_data in articles_data:
|
||||
# 添加文章标题
|
||||
doc.add_heading(article_data['title'], level=1)
|
||||
|
||||
# 添加文章元数据
|
||||
doc.add_paragraph(f"网站: {article_data['website']}")
|
||||
doc.add_paragraph(f"URL: {article_data['url']}")
|
||||
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
|
||||
doc.add_paragraph(f"创建时间: {article_data['created_at']}")
|
||||
|
||||
# 添加文章内容
|
||||
doc.add_heading('内容', level=2)
|
||||
# 简单处理HTML内容,移除标签
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(article_data['content'], 'html.parser')
|
||||
|
||||
# 处理内容中的图片
|
||||
for img in soup.find_all('img'):
|
||||
src = img.get('src', '')
|
||||
if src:
|
||||
# 尝试添加图片到文档
|
||||
try:
|
||||
import os
|
||||
from django.conf import settings
|
||||
import requests
|
||||
from io import BytesIO
|
||||
|
||||
# 构建完整的图片路径
|
||||
if src.startswith('http'):
|
||||
# 网络图片
|
||||
response = requests.get(src, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
# 本地图片
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
|
||||
if os.path.exists(full_path):
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
except Exception as e:
|
||||
# 如果添加图片失败,添加图片URL作为文本
|
||||
doc.add_paragraph(f"[图片: {src}]")
|
||||
|
||||
# 移除原始img标签
|
||||
img.decompose()
|
||||
|
||||
content_text = soup.get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 添加媒体文件信息
|
||||
if article_data['media_files']:
|
||||
doc.add_heading('媒体文件', level=2)
|
||||
for media_file in article_data['media_files']:
|
||||
try:
|
||||
import os
|
||||
from django.conf import settings
|
||||
from io import BytesIO
|
||||
import requests
|
||||
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 添加图片到文档
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
response = requests.get(media_file, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
except Exception as e:
|
||||
doc.add_paragraph(media_file)
|
||||
|
||||
# 添加分页符
|
||||
doc.add_page_break()
|
||||
|
||||
# 保存文档
|
||||
doc.save(output_path)
|
||||
# 创建一个新的Word文档
|
||||
document = Document()
|
||||
document.add_heading('文章导出', 0)
|
||||
|
||||
for article_data in articles_data:
|
||||
# 添加文章标题
|
||||
document.add_heading(article_data['title'], level=1)
|
||||
|
||||
# 添加文章信息
|
||||
document.add_paragraph(f"网站: {article_data['website']}")
|
||||
document.add_paragraph(f"URL: {article_data['url']}")
|
||||
document.add_paragraph(f"发布时间: {article_data['pub_date']}")
|
||||
document.add_paragraph(f"创建时间: {article_data['created_at']}")
|
||||
|
||||
# 添加内容标题
|
||||
document.add_heading('内容:', level=2)
|
||||
|
||||
# 处理HTML内容,移除标签
|
||||
soup = BeautifulSoup(article_data['content'], 'html.parser')
|
||||
content_text = soup.get_text()
|
||||
document.add_paragraph(content_text)
|
||||
|
||||
# 添加分页符分隔文章
|
||||
document.add_page_break()
|
||||
|
||||
# 保存文档
|
||||
document.save(output_path)
|
||||
self.stdout.write(self.style.SUCCESS(f'成功导出为Word格式: {output_path}'))
|
||||
except Exception as e:
|
||||
self.stdout.write(self.style.ERROR(f'导出Word格式失败: {e}'))
|
||||
|
||||
def export_with_media(self, articles_data, media_files, output_path, format_type):
|
||||
# 创建ZIP文件
|
||||
with zipfile.ZipFile(output_path, 'w') as zipf:
|
||||
# 添加文章数据文件
|
||||
data_filename = f'articles.{format_type}'
|
||||
if format_type == 'json':
|
||||
json_data = json.dumps(articles_data, ensure_ascii=False, indent=2)
|
||||
zipf.writestr(data_filename, json_data)
|
||||
elif format_type == 'csv':
|
||||
# 创建CSV内容
|
||||
if articles_data:
|
||||
import io
|
||||
csv_buffer = io.StringIO()
|
||||
fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
|
||||
writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
for article_data in articles_data:
|
||||
article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
|
||||
'media_files'] else ''
|
||||
writer.writerow(article_data)
|
||||
zipf.writestr(data_filename, csv_buffer.getvalue())
|
||||
# 添加Word格式支持
|
||||
elif format_type == 'docx':
|
||||
# 创建Word文档并保存到ZIP
|
||||
try:
|
||||
from docx import Document
|
||||
from docx.shared import Inches
|
||||
from io import BytesIO
|
||||
|
||||
doc = Document()
|
||||
doc.add_heading('文章导出', 0)
|
||||
|
||||
for article_data in articles_data:
|
||||
doc.add_heading(article_data['title'], level=1)
|
||||
# 为每篇文章创建独立的文件夹
|
||||
for article_data in articles_data:
|
||||
article_folder = f"article_{article_data['id']}_{article_data['title']}"
|
||||
# 限制文件夹名称长度并移除非法字符
|
||||
article_folder = article_folder[:50].rstrip()
|
||||
article_folder = "".join(c for c in article_folder if c.isalnum() or c in (' ','_','-')).rstrip()
|
||||
|
||||
# 添加文章数据文件
|
||||
if format_type == 'docx':
|
||||
# 创建Word文档并保存到ZIP
|
||||
data_filename = f'{article_folder}/article.docx'
|
||||
try:
|
||||
# 创建文章信息Word文档
|
||||
doc = Document()
|
||||
doc.add_heading(article_data['title'], 0)
|
||||
|
||||
# 添加文章信息
|
||||
doc.add_paragraph(f"网站: {article_data['website']}")
|
||||
doc.add_paragraph(f"URL: {article_data['url']}")
|
||||
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
|
||||
doc.add_paragraph(f"创建时间: {article_data['created_at']}")
|
||||
|
||||
doc.add_heading('内容', level=2)
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# 添加内容标题
|
||||
doc.add_heading('内容:', level=1)
|
||||
|
||||
# 处理HTML内容
|
||||
soup = BeautifulSoup(article_data['content'], 'html.parser')
|
||||
|
||||
# 处理内容中的图片
|
||||
for img in soup.find_all('img'):
|
||||
src = img.get('src', '')
|
||||
if src:
|
||||
# 尝试添加图片到文档
|
||||
try:
|
||||
import os
|
||||
from django.conf import settings
|
||||
import requests
|
||||
|
||||
# 构建完整的图片路径
|
||||
if src.startswith('http'):
|
||||
# 网络图片
|
||||
response = requests.get(src, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
# 本地图片
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
|
||||
if os.path.exists(full_path):
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
except Exception as e:
|
||||
# 如果添加图片失败,添加图片URL作为文本
|
||||
doc.add_paragraph(f"[图片: {src}]")
|
||||
|
||||
# 移除原始img标签
|
||||
img.decompose()
|
||||
|
||||
content_text = soup.get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
if article_data['media_files']:
|
||||
doc.add_heading('媒体文件', level=2)
|
||||
for media_file in article_data['media_files']:
|
||||
try:
|
||||
import os
|
||||
from django.conf import settings
|
||||
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 添加图片到文档
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
response = requests.get(media_file, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
except Exception as e:
|
||||
doc.add_paragraph(media_file)
|
||||
|
||||
doc.add_page_break()
|
||||
|
||||
# 将文档保存到内存中再写入ZIP
|
||||
doc_buffer = BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
zipf.writestr(data_filename, doc_buffer.read())
|
||||
except ImportError:
|
||||
zipf.writestr(data_filename, "错误:缺少python-docx库,无法生成Word文档")
|
||||
|
||||
# 添加媒体文件
|
||||
for media_path in media_files:
|
||||
arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
|
||||
zipf.write(media_path, arcname)
|
||||
|
||||
# 将文档保存到内存中
|
||||
doc_buffer = io.BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
|
||||
# 将文档添加到ZIP文件
|
||||
zipf.writestr(data_filename, doc_buffer.getvalue())
|
||||
except Exception as e:
|
||||
error_msg = f"错误:无法生成文章Word文档 - {str(e)}"
|
||||
zipf.writestr(data_filename, error_msg)
|
||||
|
||||
# 添加媒体文件到文章的media子文件夹
|
||||
if article_data['media_files']:
|
||||
for media_file in article_data['media_files']:
|
||||
try:
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 添加媒体文件到ZIP中的media子文件夹
|
||||
media_filename = f"{article_folder}/media/{os.path.basename(media_file)}"
|
||||
zipf.write(full_path, media_filename)
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
import requests
|
||||
from io import BytesIO
|
||||
|
||||
response = requests.get(media_file, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
media_filename = f"{article_folder}/media/{os.path.basename(media_file)}"
|
||||
zipf.writestr(media_filename, image_stream.getvalue())
|
||||
except Exception as e:
|
||||
# 错误处理,跳过无法添加的文件
|
||||
pass
|
||||
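For reference, this command can be driven programmatically as well as from the shell; core/tasks.py below already calls it via call_command. A minimal usage sketch following the arguments defined in add_arguments above (the website value is only an illustrative example):

from django.core.management import call_command

# Export everything as per-article Word documents bundled with their media into a ZIP.
call_command('export_articles', format='docx', output='articles_export.zip')

# Export one site's articles as plain JSON without zipping.
call_command('export_articles', format='json', website='中国青年报', no_zip=True)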
45
core/migrations/0002_crawltask.py
Normal file
@@ -0,0 +1,45 @@
# Generated by Django 5.1 on 2025-09-23 19:28

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0001_initial'),
    ]

    operations = [
        migrations.CreateModel(
            name='CrawlTask',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=200, verbose_name='任务名称')),
                ('task_type', models.CharField(choices=[('keyword', '关键词搜索'), ('historical', '历史文章'), ('full_site', '全站爬取')], default='keyword', max_length=20, verbose_name='任务类型')),
                ('keyword', models.CharField(blank=True, max_length=200, null=True, verbose_name='搜索关键词')),
                ('websites', models.JSONField(default=list, verbose_name='目标网站')),
                ('start_date', models.DateField(blank=True, null=True, verbose_name='开始日期')),
                ('end_date', models.DateField(blank=True, null=True, verbose_name='结束日期')),
                ('max_pages', models.IntegerField(default=10, verbose_name='最大页数')),
                ('max_articles', models.IntegerField(default=100, verbose_name='最大文章数')),
                ('status', models.CharField(choices=[('pending', '等待中'), ('running', '运行中'), ('completed', '已完成'), ('failed', '失败'), ('cancelled', '已取消')], default='pending', max_length=20, verbose_name='状态')),
                ('progress', models.IntegerField(default=0, verbose_name='进度百分比')),
                ('current_website', models.CharField(blank=True, max_length=100, null=True, verbose_name='当前网站')),
                ('current_action', models.CharField(blank=True, max_length=200, null=True, verbose_name='当前操作')),
                ('total_articles', models.IntegerField(default=0, verbose_name='总文章数')),
                ('success_count', models.IntegerField(default=0, verbose_name='成功数')),
                ('failed_count', models.IntegerField(default=0, verbose_name='失败数')),
                ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='创建时间')),
                ('started_at', models.DateTimeField(blank=True, null=True, verbose_name='开始时间')),
                ('completed_at', models.DateTimeField(blank=True, null=True, verbose_name='完成时间')),
                ('error_message', models.TextField(blank=True, null=True, verbose_name='错误信息')),
                ('result_details', models.JSONField(blank=True, default=dict, verbose_name='结果详情')),
                ('created_by', models.CharField(blank=True, max_length=100, null=True, verbose_name='创建者')),
            ],
            options={
                'verbose_name': '爬取任务',
                'verbose_name_plural': '爬取任务',
                'ordering': ['-created_at'],
            },
        ),
    ]
22
core/migrations/0003_remove_crawltask_websites_crawltask_websites.py
Normal file
@@ -0,0 +1,22 @@
# Generated by Django 5.1 on 2025-09-23 19:34

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0002_crawltask'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='crawltask',
            name='websites',
        ),
        migrations.AddField(
            model_name='crawltask',
            name='websites',
            field=models.ManyToManyField(blank=True, to='core.website', verbose_name='目标网站'),
        ),
    ]
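Note that 0003 drops the old JSONField and immediately re-adds websites as a ManyToManyField, so any website names stored in existing rows are discarded. If preserving them mattered, the operations could instead rename the old column, copy its values across, and then drop it. A rough sketch of that alternative (an editorial suggestion, not part of this commit; it assumes the JSON list held Website names):

from django.db import migrations, models


def copy_names_to_m2m(apps, schema_editor):
    # Match the names stored in the renamed JSON list against Website rows by name.
    CrawlTask = apps.get_model('core', 'CrawlTask')
    Website = apps.get_model('core', 'Website')
    for task in CrawlTask.objects.all():
        task.websites.set(Website.objects.filter(name__in=task.websites_old or []))


class Migration(migrations.Migration):
    dependencies = [('core', '0002_crawltask')]
    operations = [
        migrations.RenameField('crawltask', 'websites', 'websites_old'),
        migrations.AddField('crawltask', 'websites',
                            models.ManyToManyField(blank=True, to='core.website', verbose_name='目标网站')),
        migrations.RunPython(copy_names_to_m2m, migrations.RunPython.noop),
        migrations.RemoveField('crawltask', 'websites_old'),
    ]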
28
core/migrations/0004_crawltask_execution_count_and_more.py
Normal file
@@ -0,0 +1,28 @@
# Generated by Django 5.1 on 2025-09-25 02:16

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0003_remove_crawltask_websites_crawltask_websites'),
    ]

    operations = [
        migrations.AddField(
            model_name='crawltask',
            name='execution_count',
            field=models.IntegerField(default=0, verbose_name='执行次数'),
        ),
        migrations.AddField(
            model_name='crawltask',
            name='execution_history',
            field=models.JSONField(blank=True, default=list, verbose_name='执行历史'),
        ),
        migrations.AddField(
            model_name='crawltask',
            name='last_execution_at',
            field=models.DateTimeField(blank=True, null=True, verbose_name='最后执行时间'),
        ),
    ]
146
core/models.py
@@ -1,4 +1,6 @@
from django.db import models
from django.utils import timezone
import json


class Website(models.Model):
@@ -25,3 +27,147 @@ class Article(models.Model):

    def __str__(self):
        return self.title


class CrawlTask(models.Model):
    """爬取任务模型"""
    TASK_STATUS_CHOICES = [
        ('pending', '等待中'),
        ('running', '运行中'),
        ('completed', '已完成'),
        ('failed', '失败'),
        ('cancelled', '已取消'),
    ]

    TASK_TYPE_CHOICES = [
        ('keyword', '关键词搜索'),
        ('historical', '历史文章'),
        ('full_site', '全站爬取'),
    ]

    name = models.CharField(max_length=200, verbose_name="任务名称")
    task_type = models.CharField(max_length=20, choices=TASK_TYPE_CHOICES, default='keyword', verbose_name="任务类型")
    keyword = models.CharField(max_length=200, blank=True, null=True, verbose_name="搜索关键词")
    websites = models.ManyToManyField(Website, blank=True, verbose_name="目标网站")
    start_date = models.DateField(blank=True, null=True, verbose_name="开始日期")
    end_date = models.DateField(blank=True, null=True, verbose_name="结束日期")
    max_pages = models.IntegerField(default=10, verbose_name="最大页数")
    max_articles = models.IntegerField(default=100, verbose_name="最大文章数")

    status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='pending', verbose_name="状态")
    progress = models.IntegerField(default=0, verbose_name="进度百分比")
    current_website = models.CharField(max_length=100, blank=True, null=True, verbose_name="当前网站")
    current_action = models.CharField(max_length=200, blank=True, null=True, verbose_name="当前操作")

    total_articles = models.IntegerField(default=0, verbose_name="总文章数")
    success_count = models.IntegerField(default=0, verbose_name="成功数")
    failed_count = models.IntegerField(default=0, verbose_name="失败数")

    created_at = models.DateTimeField(auto_now_add=True, verbose_name="创建时间")
    started_at = models.DateTimeField(blank=True, null=True, verbose_name="开始时间")
    completed_at = models.DateTimeField(blank=True, null=True, verbose_name="完成时间")

    error_message = models.TextField(blank=True, null=True, verbose_name="错误信息")
    result_details = models.JSONField(default=dict, blank=True, verbose_name="结果详情")

    created_by = models.CharField(max_length=100, blank=True, null=True, verbose_name="创建者")

    # 执行历史字段
    execution_count = models.IntegerField(default=0, verbose_name="执行次数")
    last_execution_at = models.DateTimeField(blank=True, null=True, verbose_name="最后执行时间")
    execution_history = models.JSONField(default=list, blank=True, verbose_name="执行历史")

    class Meta:
        verbose_name = "爬取任务"
        verbose_name_plural = "爬取任务"
        ordering = ['-created_at']

    def __str__(self):
        return f"{self.name} ({self.get_status_display()})"

    def get_websites_display(self):
        """获取网站列表的显示文本"""
        try:
            websites = self.websites.all()
            if not websites:
                return "所有网站"
            # 确保网站名称是字符串并可以被join处理
            website_names = [str(w.name) for w in websites if w.name]
            return ", ".join(website_names) if website_names else "所有网站"
        except Exception:
            # 如果出现任何异常,返回默认值
            return "所有网站"

    def get_duration(self):
        """获取任务执行时长"""
        if not self.started_at:
            return None
        end_time = self.completed_at or timezone.now()
        return end_time - self.started_at

    def is_running(self):
        """判断任务是否正在运行"""
        return self.status == 'running'

    def can_cancel(self):
        """判断任务是否可以取消"""
        return self.status in ['pending', 'running']

    def get_progress_display(self):
        """获取进度显示文本"""
        if self.status == 'pending':
            return "等待开始"
        elif self.status == 'running':
            if self.current_website and self.current_action:
                return f"正在处理 {self.current_website}: {self.current_action}"
            return f"运行中 ({self.progress}%)"
        elif self.status == 'completed':
            return f"已完成 ({self.success_count}/{self.total_articles})"
        elif self.status == 'failed':
            return f"失败: {self.error_message[:50]}..." if self.error_message else "失败"
        elif self.status == 'cancelled':
            return "已取消"
        return "未知状态"

    def add_execution_record(self, status, started_at=None, completed_at=None, error_message=None):
        """添加执行记录"""
        if not started_at:
            started_at = timezone.now()

        execution_record = {
            'execution_id': len(self.execution_history) + 1,
            'started_at': started_at.isoformat() if started_at else None,
            'completed_at': completed_at.isoformat() if completed_at else None,
            'status': status,
            'error_message': error_message,
            'success_count': self.success_count,
            'failed_count': self.failed_count,
            'total_articles': self.total_articles
        }

        # 更新执行历史
        if not self.execution_history:
            self.execution_history = []

        self.execution_history.append(execution_record)

        # 更新执行次数和最后执行时间
        self.execution_count += 1
        self.last_execution_at = started_at

        # 只保留最近10次执行记录
        if len(self.execution_history) > 10:
            self.execution_history = self.execution_history[-10:]

        self.save()

    def get_execution_summary(self):
        """获取执行摘要"""
        if not self.execution_history:
            return "暂无执行记录"

        total_executions = len(self.execution_history)
        successful_executions = len([r for r in self.execution_history if r['status'] == 'completed'])
        failed_executions = len([r for r in self.execution_history if r['status'] == 'failed'])

        return f"执行 {total_executions} 次,成功 {successful_executions} 次,失败 {failed_executions} 次"
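A quick illustration of how the CrawlTask helpers above combine in practice; this is an editorial sketch (e.g. a Django shell session), not code from the repository:

from django.utils import timezone
from core.models import CrawlTask

task = CrawlTask.objects.create(name='每日全站抓取', task_type='full_site', max_pages=200)
task.status = 'running'
task.started_at = timezone.now()
task.save()

# ... crawling happens elsewhere ...

task.status = 'completed'
task.completed_at = timezone.now()
task.save()
task.add_execution_record(status='completed', started_at=task.started_at, completed_at=task.completed_at)

print(task.get_progress_display())     # e.g. "已完成 (0/0)" until the counters are filled in
print(task.get_duration())             # timedelta between started_at and completed_at
print(task.get_execution_summary())    # "执行 1 次,成功 1 次,失败 0 次"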
123
core/static/admin/js/crawl_task_actions.js
Normal file
@@ -0,0 +1,123 @@
|
||||
/**
|
||||
* 爬取任务操作JavaScript
|
||||
*/
|
||||
|
||||
function startTask(taskId) {
|
||||
if (confirm('确定要启动这个任务吗?')) {
|
||||
fetch(`/admin/core/crawltask/${taskId}/start/`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'X-CSRFToken': getCookie('csrftoken'),
|
||||
'Content-Type': 'application/x-www-form-urlencoded',
|
||||
},
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
location.reload();
|
||||
} else {
|
||||
alert('启动任务失败');
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error:', error);
|
||||
alert('启动任务失败');
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
function cancelTask(taskId) {
|
||||
if (confirm('确定要取消这个任务吗?')) {
|
||||
fetch(`/admin/core/crawltask/${taskId}/cancel/`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'X-CSRFToken': getCookie('csrftoken'),
|
||||
'Content-Type': 'application/x-www-form-urlencoded',
|
||||
},
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
// 显示取消中的提示
|
||||
const cancelButton = document.querySelector(`a[href="javascript:void(0)"][onclick="cancelTask(${taskId})"]`);
|
||||
if (cancelButton) {
|
||||
cancelButton.textContent = '取消中...';
|
||||
cancelButton.style.pointerEvents = 'none';
|
||||
cancelButton.style.opacity = '0.5';
|
||||
}
|
||||
// 5秒后刷新页面以查看状态更新
|
||||
setTimeout(() => location.reload(), 2000);
|
||||
} else {
|
||||
alert('取消任务失败');
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error:', error);
|
||||
alert('取消任务失败');
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
function rerunTask(taskId) {
|
||||
if (confirm('确定要重新执行这个任务吗?这将重置任务状态并重新开始爬取。')) {
|
||||
fetch(`/admin/core/crawltask/${taskId}/rerun/`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'X-CSRFToken': getCookie('csrftoken'),
|
||||
'Content-Type': 'application/x-www-form-urlencoded',
|
||||
},
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
// 显示重新执行中的提示
|
||||
const rerunButton = document.querySelector(`a[href="javascript:void(0)"][onclick="rerunTask(${taskId})"]`);
|
||||
if (rerunButton) {
|
||||
rerunButton.textContent = '重新执行中...';
|
||||
rerunButton.style.pointerEvents = 'none';
|
||||
rerunButton.style.opacity = '0.5';
|
||||
}
|
||||
// 2秒后刷新页面以查看状态更新
|
||||
setTimeout(() => location.reload(), 2000);
|
||||
} else {
|
||||
alert('重新执行任务失败');
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Error:', error);
|
||||
alert('重新执行任务失败');
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
function viewResults(taskId) {
|
||||
window.open(`/admin/core/crawltask/${taskId}/results/`, '_blank');
|
||||
}
|
||||
|
||||
function getCookie(name) {
|
||||
let cookieValue = null;
|
||||
if (document.cookie && document.cookie !== '') {
|
||||
const cookies = document.cookie.split(';');
|
||||
for (let i = 0; i < cookies.length; i++) {
|
||||
const cookie = cookies[i].trim();
|
||||
if (cookie.substring(0, name.length + 1) === (name + '=')) {
|
||||
cookieValue = decodeURIComponent(cookie.substring(name.length + 1));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return cookieValue;
|
||||
}
|
||||
|
||||
// 自动刷新运行中的任务状态
|
||||
function autoRefreshRunningTasks() {
|
||||
const runningTasks = document.querySelectorAll('[data-task-status="running"]');
|
||||
if (runningTasks.length > 0) {
|
||||
// 每30秒刷新一次页面
|
||||
setTimeout(() => {
|
||||
location.reload();
|
||||
}, 30000);
|
||||
}
|
||||
}
|
||||
|
||||
// 页面加载完成后执行
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
autoRefreshRunningTasks();
|
||||
});
|
||||
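The fetch calls above target custom admin endpoints (/start/, /cancel/, /rerun/, /results/) whose server side is not part of this hunk. A sketch of how such routes are commonly registered on a ModelAdmin, wired to the task_executor introduced below; all names here are assumptions, not the project's actual admin code:

from django.contrib import admin
from django.http import JsonResponse
from django.urls import path

from core.models import CrawlTask
from core.task_executor import task_executor


class CrawlTaskAdmin(admin.ModelAdmin):
    def get_urls(self):
        # Custom routes resolve under /admin/core/crawltask/<id>/..., matching the JS above.
        custom = [
            path('<int:task_id>/start/', self.admin_site.admin_view(self.start_view), name='crawltask_start'),
            path('<int:task_id>/cancel/', self.admin_site.admin_view(self.cancel_view), name='crawltask_cancel'),
        ]
        return custom + super().get_urls()

    def start_view(self, request, task_id):
        ok, msg = task_executor.start_task(task_id)
        return JsonResponse({'ok': ok, 'message': msg}, status=200 if ok else 400)

    def cancel_view(self, request, task_id):
        ok, msg = task_executor.cancel_task(task_id)
        return JsonResponse({'ok': ok, 'message': msg}, status=200 if ok else 400)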
474
core/task_executor.py
Normal file
@@ -0,0 +1,474 @@
|
||||
"""
|
||||
爬取任务执行器
|
||||
负责执行爬取任务并更新任务状态
|
||||
"""
|
||||
|
||||
import threading
|
||||
import time
|
||||
from django.utils import timezone
|
||||
from django.db import transaction
|
||||
from core.models import CrawlTask
|
||||
from core.utils import crawl_by_keyword, crawl_historical_articles, full_site_crawler, WEBSITE_CRAWL_CONFIGS
|
||||
|
||||
|
||||
class TaskExecutor:
|
||||
"""任务执行器"""
|
||||
|
||||
def __init__(self):
|
||||
self.running_tasks = {}
|
||||
self.cancelled_tasks = set() # 添加已取消任务的集合
|
||||
self.lock = threading.Lock()
|
||||
|
||||
def start_task(self, task_id, rerun=False):
|
||||
"""启动任务"""
|
||||
with self.lock:
|
||||
if task_id in self.running_tasks:
|
||||
return False, "任务已在运行中"
|
||||
|
||||
try:
|
||||
task = CrawlTask.objects.get(id=task_id)
|
||||
|
||||
# 检查任务状态
|
||||
if not rerun and task.status != 'pending':
|
||||
return False, "任务状态不允许启动"
|
||||
|
||||
# 如果是重新执行,检查任务是否已完成或失败
|
||||
if rerun and task.status not in ['completed', 'failed', 'cancelled']:
|
||||
return False, "只有已完成、失败或已取消的任务可以重新执行"
|
||||
|
||||
# 重置任务状态(如果是重新执行)
|
||||
if rerun:
|
||||
task.status = 'running'
|
||||
task.started_at = timezone.now()
|
||||
task.completed_at = None
|
||||
task.error_message = None
|
||||
task.progress = 0
|
||||
task.current_website = None
|
||||
task.current_action = None
|
||||
task.total_articles = 0
|
||||
task.success_count = 0
|
||||
task.failed_count = 0
|
||||
task.result_details = {}
|
||||
else:
|
||||
# 更新任务状态
|
||||
task.status = 'running'
|
||||
task.started_at = timezone.now()
|
||||
|
||||
task.save()
|
||||
|
||||
# 确保任务不在取消集合中
|
||||
self.cancelled_tasks.discard(task_id)
|
||||
|
||||
# 启动后台线程执行任务
|
||||
thread = threading.Thread(target=self._execute_task, args=(task_id,))
|
||||
thread.daemon = True
|
||||
thread.start()
|
||||
|
||||
self.running_tasks[task_id] = thread
|
||||
return True, "任务已启动" + ("(重新执行)" if rerun else "")
|
||||
|
||||
except CrawlTask.DoesNotExist:
|
||||
return False, "任务不存在"
|
||||
except Exception as e:
|
||||
return False, f"启动任务失败: {e}"
|
||||
|
||||
def rerun_task(self, task_id):
|
||||
"""重新执行任务"""
|
||||
return self.start_task(task_id, rerun=True)
|
||||
|
||||
def cancel_task(self, task_id):
|
||||
"""取消任务"""
|
||||
with self.lock:
|
||||
# 将任务标记为已取消
|
||||
self.cancelled_tasks.add(task_id)
|
||||
|
||||
if task_id in self.running_tasks:
|
||||
# 标记任务为取消状态
|
||||
try:
|
||||
task = CrawlTask.objects.get(id=task_id)
|
||||
task.status = 'cancelled'
|
||||
task.completed_at = timezone.now()
|
||||
task.save()
|
||||
|
||||
# 记录执行历史
|
||||
task.add_execution_record(
|
||||
status='cancelled',
|
||||
started_at=task.started_at,
|
||||
completed_at=task.completed_at,
|
||||
error_message='任务被取消'
|
||||
)
|
||||
|
||||
# 移除运行中的任务
|
||||
del self.running_tasks[task_id]
|
||||
return True, "任务已取消"
|
||||
except CrawlTask.DoesNotExist:
|
||||
return False, "任务不存在"
|
||||
else:
|
||||
# 即使任务不在运行中,也标记为已取消
|
||||
try:
|
||||
task = CrawlTask.objects.get(id=task_id)
|
||||
if task.status in ['pending', 'running']:
|
||||
task.status = 'cancelled'
|
||||
task.completed_at = timezone.now()
|
||||
task.save()
|
||||
|
||||
# 记录执行历史
|
||||
task.add_execution_record(
|
||||
status='cancelled',
|
||||
started_at=task.started_at,
|
||||
completed_at=task.completed_at,
|
||||
error_message='任务被取消'
|
||||
)
|
||||
return True, "任务已取消"
|
||||
except CrawlTask.DoesNotExist:
|
||||
pass
|
||||
return False, "任务未在运行中"
|
||||
|
||||
def is_task_cancelled(self, task_id):
|
||||
"""检查任务是否已被取消"""
|
||||
with self.lock:
|
||||
return task_id in self.cancelled_tasks
|
||||
|
||||
def _execute_task(self, task_id):
|
||||
"""执行任务的核心逻辑"""
|
||||
try:
|
||||
task = CrawlTask.objects.get(id=task_id)
|
||||
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
# 根据任务类型执行不同的爬取逻辑
|
||||
if task.task_type == 'keyword':
|
||||
self._execute_keyword_task(task)
|
||||
elif task.task_type == 'historical':
|
||||
self._execute_historical_task(task)
|
||||
elif task.task_type == 'full_site':
|
||||
self._execute_full_site_task(task)
|
||||
else:
|
||||
raise ValueError(f"不支持的任务类型: {task.task_type}")
|
||||
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
# 任务完成
|
||||
with transaction.atomic():
|
||||
task = CrawlTask.objects.select_for_update().get(id=task_id)
|
||||
task.status = 'completed'
|
||||
task.completed_at = timezone.now()
|
||||
task.progress = 100
|
||||
task.save()
|
||||
|
||||
# 记录执行历史
|
||||
task.add_execution_record(
|
||||
status='completed',
|
||||
started_at=task.started_at,
|
||||
completed_at=task.completed_at
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
# 任务失败
|
||||
try:
|
||||
with transaction.atomic():
|
||||
task = CrawlTask.objects.select_for_update().get(id=task_id)
|
||||
task.status = 'failed'
|
||||
task.completed_at = timezone.now()
|
||||
task.error_message = str(e)
|
||||
task.save()
|
||||
|
||||
# 记录执行历史
|
||||
task.add_execution_record(
|
||||
status='failed',
|
||||
started_at=task.started_at,
|
||||
completed_at=task.completed_at,
|
||||
error_message=str(e)
|
||||
)
|
||||
except:
|
||||
pass
|
||||
|
||||
finally:
|
||||
# 清理运行中的任务记录
|
||||
with self.lock:
|
||||
if task_id in self.running_tasks:
|
||||
del self.running_tasks[task_id]
|
||||
# 从取消集合中移除任务
|
||||
self.cancelled_tasks.discard(task_id)
|
||||
|
||||
def _mark_task_cancelled(self, task_id):
|
||||
"""标记任务为已取消"""
|
||||
try:
|
||||
with transaction.atomic():
|
||||
task = CrawlTask.objects.select_for_update().get(id=task_id)
|
||||
task.status = 'cancelled'
|
||||
task.completed_at = timezone.now()
|
||||
task.save()
|
||||
|
||||
# 记录执行历史
|
||||
task.add_execution_record(
|
||||
status='cancelled',
|
||||
started_at=task.started_at,
|
||||
completed_at=task.completed_at,
|
||||
error_message='任务被取消'
|
||||
)
|
||||
except CrawlTask.DoesNotExist:
|
||||
pass
|
||||
|
||||
def _execute_keyword_task(self, task):
|
||||
"""执行关键词搜索任务"""
|
||||
task_id = task.id
|
||||
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
# 更新当前操作
|
||||
task.current_action = "开始关键词搜索"
|
||||
task.save()
|
||||
|
||||
# 准备参数
|
||||
selected_websites = task.websites.all()
|
||||
if selected_websites:
|
||||
websites = [w.name for w in selected_websites]
|
||||
else:
|
||||
websites = list(WEBSITE_CRAWL_CONFIGS.keys())
|
||||
|
||||
start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None
|
||||
end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None
|
||||
|
||||
# 设置任务ID,以便在爬虫函数中检查取消状态
|
||||
crawl_by_keyword.task_id = task_id
|
||||
|
||||
# 使用新的关键词爬虫引擎
|
||||
try:
|
||||
# 延迟导入以避免循环依赖
|
||||
from core.keyword_crawler import KeywordCrawler
|
||||
crawler = KeywordCrawler(task_id, self)
|
||||
crawler.run()
|
||||
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
# 更新结果统计
|
||||
task = CrawlTask.objects.get(id=task_id)
|
||||
if task.status == 'completed':
|
||||
# 统计爬取的文章数量
|
||||
from core.models import Article
|
||||
article_count = Article.objects.filter(website__in=task.websites.all()).count()
|
||||
task.total_articles = article_count
|
||||
task.success_count = article_count
|
||||
task.failed_count = 0
|
||||
task.result_details = {
|
||||
'total_articles': article_count,
|
||||
'success_count': article_count,
|
||||
'failed_count': 0,
|
||||
'keyword': task.keyword,
|
||||
'websites': [w.name for w in task.websites.all()]
|
||||
}
|
||||
task.save()
|
||||
|
||||
# 添加执行记录
|
||||
task.add_execution_record(
|
||||
status='completed',
|
||||
started_at=task.started_at,
|
||||
completed_at=task.completed_at
|
||||
)
|
||||
elif self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
except Exception as e:
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
# 更新任务状态为失败
|
||||
task = CrawlTask.objects.get(id=task_id)
|
||||
task.status = 'failed'
|
||||
task.error_message = str(e)
|
||||
task.completed_at = timezone.now()
|
||||
task.save()
|
||||
|
||||
# 添加执行记录
|
||||
task.add_execution_record(
|
||||
status='failed',
|
||||
started_at=task.started_at,
|
||||
completed_at=task.completed_at,
|
||||
error_message=str(e)
|
||||
)
|
||||
|
||||
raise e
|
||||
|
||||
def _execute_historical_task(self, task):
|
||||
"""执行历史文章任务"""
|
||||
task_id = task.id
|
||||
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
# 更新当前操作
|
||||
task.current_action = "开始历史文章爬取"
|
||||
task.save()
|
||||
|
||||
# 准备参数
|
||||
selected_websites = task.websites.all()
|
||||
if selected_websites:
|
||||
websites = [w.name for w in selected_websites]
|
||||
else:
|
||||
websites = list(WEBSITE_CRAWL_CONFIGS.keys())
|
||||
|
||||
start_date = task.start_date.strftime('%Y-%m-%d') if task.start_date else None
|
||||
end_date = task.end_date.strftime('%Y-%m-%d') if task.end_date else None
|
||||
|
||||
# 设置任务ID,以便在爬虫函数中检查取消状态
|
||||
crawl_historical_articles.task_id = task_id
|
||||
|
||||
# 执行爬取
|
||||
try:
|
||||
results = crawl_historical_articles(
|
||||
website_names=websites,
|
||||
start_date=start_date,
|
||||
end_date=end_date,
|
||||
max_articles_per_site=task.max_articles
|
||||
)
|
||||
except Exception as e:
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
raise e
|
||||
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
# 更新结果
|
||||
task.total_articles = results['total_articles']
|
||||
task.success_count = results['success_count']
|
||||
task.failed_count = results['failed_count']
|
||||
task.result_details = results['website_results']
|
||||
task.save()
|
||||
|
||||
def _execute_full_site_task(self, task):
|
||||
"""执行全站爬取任务"""
|
||||
task_id = task.id
|
||||
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
# 更新当前操作
|
||||
task.current_action = "开始全站爬取"
|
||||
task.save()
|
||||
|
||||
# 准备参数
|
||||
selected_websites = task.websites.all()
|
||||
if selected_websites:
|
||||
websites = [w.name for w in selected_websites]
|
||||
else:
|
||||
websites = list(WEBSITE_CRAWL_CONFIGS.keys())
|
||||
|
||||
total_websites = len(websites)
|
||||
completed_websites = 0
|
||||
|
||||
for website_name in websites:
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
try:
|
||||
# 更新当前网站
|
||||
task.current_website = website_name
|
||||
task.current_action = f"正在爬取 {website_name}"
|
||||
task.save()
|
||||
|
||||
# 获取或创建网站对象
|
||||
from core.models import Website
|
||||
website, created = Website.objects.get_or_create(
|
||||
name=website_name,
|
||||
defaults={
|
||||
'base_url': WEBSITE_CRAWL_CONFIGS[website_name]["base_url"],
|
||||
'enabled': True
|
||||
}
|
||||
)
|
||||
|
||||
# 设置任务ID,以便在爬虫函数中检查取消状态
|
||||
full_site_crawler.task_id = task_id
|
||||
|
||||
# 执行全站爬取
|
||||
try:
|
||||
full_site_crawler(
|
||||
WEBSITE_CRAWL_CONFIGS[website_name]["base_url"],
|
||||
website,
|
||||
max_pages=task.max_pages
|
||||
)
|
||||
except Exception as e:
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
raise e
|
||||
|
||||
completed_websites += 1
|
||||
progress = int((completed_websites / total_websites) * 100)
|
||||
task.progress = progress
|
||||
task.save()
|
||||
|
||||
except Exception as e:
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
# 记录错误但继续处理其他网站
|
||||
print(f"爬取网站 {website_name} 时出错: {e}")
|
||||
continue
|
||||
|
||||
# 检查任务是否已被取消
|
||||
if self.is_task_cancelled(task_id):
|
||||
self._mark_task_cancelled(task_id)
|
||||
return
|
||||
|
||||
# 更新最终结果
|
||||
task.total_articles = completed_websites # 这里可以改为实际爬取的文章数
|
||||
task.success_count = completed_websites
|
||||
task.failed_count = total_websites - completed_websites
|
||||
task.save()
|
||||
|
||||
def get_task_status(self, task_id):
|
||||
"""获取任务状态"""
|
||||
try:
|
||||
task = CrawlTask.objects.get(id=task_id)
|
||||
return {
|
||||
'status': task.status,
|
||||
'progress': task.progress,
|
||||
'current_website': task.current_website,
|
||||
'current_action': task.current_action,
|
||||
'total_articles': task.total_articles,
|
||||
'success_count': task.success_count,
|
||||
'failed_count': task.failed_count,
|
||||
'error_message': task.error_message
|
||||
}
|
||||
except CrawlTask.DoesNotExist:
|
||||
return None
|
||||
|
||||
|
||||
# 全局任务执行器实例
|
||||
task_executor = TaskExecutor()
|
||||
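The module-level task_executor instance is what other code is expected to import; a minimal usage sketch (illustrative only, not lifted from the diff):

from core.task_executor import task_executor

ok, message = task_executor.start_task(task_id=42)   # runs the task in a daemon thread
print(message)

status = task_executor.get_task_status(42)           # dict with progress, counts and errors, or None
if status and status['status'] == 'running':
    task_executor.cancel_task(42)                    # cooperative cancel via the cancelled_tasks set

ok, message = task_executor.rerun_task(42)           # only allowed for completed/failed/cancelled tasks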
227
core/tasks.py
Normal file
@@ -0,0 +1,227 @@
|
||||
import logging
|
||||
from celery import shared_task
|
||||
from django.core.management import call_command
|
||||
# from django.conf import settings
|
||||
from .models import Website, Article
|
||||
from .utils import full_site_crawler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@shared_task(bind=True, max_retries=3)
|
||||
def crawl_website(self, website_id, node_id=None, batch_id=None):
|
||||
"""
|
||||
爬取单个网站的任务
|
||||
"""
|
||||
try:
|
||||
website = Website.objects.get(id=website_id)
|
||||
logger.info(f"开始爬取网站: {website.name} (节点: {node_id}, 批次: {batch_id})")
|
||||
logger.info(f"网站URL: {website.base_url}")
|
||||
|
||||
# 记录任务开始
|
||||
if node_id and batch_id:
|
||||
from .distributed_crawler import distributed_crawler
|
||||
distributed_crawler.heartbeat(node_id, 1)
|
||||
logger.info(f"分布式爬虫心跳已发送 - 节点: {node_id}, 状态: 1")
|
||||
|
||||
# 调用爬虫函数
|
||||
logger.info(f"开始调用 full_site_crawler 函数处理网站: {website.name}")
|
||||
full_site_crawler(website.base_url, website, max_pages=100)
|
||||
logger.info(f"完成调用 full_site_crawler 函数处理网站: {website.name}")
|
||||
|
||||
# 统计结果
|
||||
article_count = website.article_set.count()
|
||||
logger.info(f"网站 {website.name} 爬取完成,共 {article_count} 篇文章")
|
||||
|
||||
# 记录任务完成
|
||||
if node_id and batch_id:
|
||||
distributed_crawler.heartbeat(node_id, 0)
|
||||
logger.info(f"分布式爬虫心跳已发送 - 节点: {node_id}, 状态: 0")
|
||||
|
||||
result = {
|
||||
'website_id': website_id,
|
||||
'website_name': website.name,
|
||||
'article_count': article_count,
|
||||
'status': 'success',
|
||||
'node_id': node_id,
|
||||
'batch_id': batch_id
|
||||
}
|
||||
logger.info(f"任务完成,返回结果: {result}")
|
||||
return result
|
||||
|
||||
except Website.DoesNotExist:
|
||||
error_msg = f"网站不存在: {website_id}"
|
||||
logger.error(error_msg)
|
||||
raise
|
||||
except Exception as exc:
|
||||
error_msg = f"爬取网站 {website_id} 失败: {exc}"
|
||||
logger.error(error_msg)
|
||||
# 重试任务
|
||||
logger.info(f"准备重试任务,将在5分钟后重试")
|
||||
raise self.retry(exc=exc, countdown=60 * 5) # 5分钟后重试
|
||||
|
||||
|
||||
@shared_task(bind=True, max_retries=3)
|
||||
def crawl_all_websites(self):
|
||||
"""
|
||||
爬取所有网站的任务
|
||||
"""
|
||||
try:
|
||||
logger.info("开始批量爬取所有网站")
|
||||
|
||||
# 获取所有启用的网站
|
||||
websites = Website.objects.filter(enabled=True)
|
||||
total_websites = websites.count()
|
||||
logger.info(f"找到 {total_websites} 个启用的网站")
|
||||
|
||||
results = []
|
||||
for website in websites:
|
||||
try:
|
||||
logger.info(f"启动网站 {website.name} 的爬取任务")
|
||||
# 调用单个网站爬取任务
|
||||
result = crawl_website.delay(website.id)
|
||||
logger.info(f"网站 {website.name} 的爬取任务已启动,任务ID: {result.id}")
|
||||
results.append({
|
||||
'website_id': website.id,
|
||||
'website_name': website.name,
|
||||
'task_id': result.id
|
||||
})
|
||||
except Exception as e:
|
||||
error_msg = f"启动网站 {website.name} 爬取任务失败: {e}"
|
||||
logger.error(error_msg)
|
||||
results.append({
|
||||
'website_id': website.id,
|
||||
'website_name': website.name,
|
||||
'error': str(e)
|
||||
})
|
||||
|
||||
logger.info(f"批量爬取任务启动完成,共 {total_websites} 个网站")
|
||||
|
||||
return {
|
||||
'total_websites': total_websites,
|
||||
'results': results,
|
||||
'status': 'started'
|
||||
}
|
||||
|
||||
except Exception as exc:
|
||||
error_msg = f"批量爬取任务失败: {exc}"
|
||||
logger.error(error_msg)
|
||||
raise self.retry(exc=exc, countdown=60 * 10) # 10分钟后重试
|
||||
|
||||
|
||||
@shared_task
|
||||
def crawl_specific_media(media_list):
|
||||
"""
|
||||
爬取指定媒体的任务
|
||||
"""
|
||||
try:
|
||||
logger.info(f"开始爬取指定媒体: {media_list}")
|
||||
|
||||
# 调用管理命令
|
||||
logger.info("调用 crawl_all_media 管理命令")
|
||||
call_command('crawl_all_media', media=','.join(media_list))
|
||||
logger.info("crawl_all_media 管理命令执行完成")
|
||||
|
||||
return {
|
||||
'media_list': media_list,
|
||||
'status': 'success'
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"爬取指定媒体失败: {e}"
|
||||
logger.error(error_msg)
|
||||
raise
|
||||
|
||||
|
||||
@shared_task
|
||||
def cleanup_old_articles(days=30):
|
||||
"""
|
||||
清理旧文章的任务
|
||||
"""
|
||||
try:
|
||||
from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
|
||||
cutoff_date = timezone.now() - timedelta(days=days)
|
||||
logger.info(f"查找 {days} 天前的文章,截止日期: {cutoff_date}")
|
||||
old_articles = Article.objects.filter(created_at__lt=cutoff_date)
|
||||
count = old_articles.count()
|
||||
logger.info(f"找到 {count} 篇旧文章")
|
||||
|
||||
old_articles.delete()
|
||||
logger.info(f"已删除 {count} 篇旧文章")
|
||||
|
||||
logger.info(f"清理了 {count} 篇旧文章({days}天前)")
|
||||
|
||||
return {
|
||||
'deleted_count': count,
|
||||
'cutoff_date': cutoff_date.isoformat(),
|
||||
'status': 'success'
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"清理旧文章失败: {e}"
|
||||
logger.error(error_msg)
|
||||
raise
|
||||
|
||||
|
||||
@shared_task
|
||||
def export_articles():
|
||||
"""
|
||||
导出文章的任务
|
||||
"""
|
||||
try:
|
||||
logger.info("开始导出文章")
|
||||
|
||||
# 调用导出命令
|
||||
logger.info("调用 export_articles 管理命令")
|
||||
call_command('export_articles')
|
||||
logger.info("export_articles 管理命令执行完成")
|
||||
|
||||
return {
|
||||
'status': 'success',
|
||||
'message': '文章导出完成'
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"导出文章失败: {e}"
|
||||
logger.error(error_msg)
|
||||
raise
|
||||
|
||||
|
||||
@shared_task
|
||||
def health_check():
|
||||
"""
|
||||
健康检查任务
|
||||
"""
|
||||
try:
|
||||
logger.info("开始执行健康检查")
|
||||
# 检查数据库连接
|
||||
website_count = Website.objects.count()
|
||||
article_count = Article.objects.count()
|
||||
logger.info(f"数据库状态正常 - 网站数量: {website_count}, 文章数量: {article_count}")
|
||||
|
||||
# 检查Redis连接
|
||||
from django.core.cache import cache
|
||||
logger.info("检查Redis连接")
|
||||
cache.set('health_check', 'ok', 60)
|
||||
cache_result = cache.get('health_check')
|
||||
logger.info(f"Redis连接状态: {'正常' if cache_result == 'ok' else '异常'}")
|
||||
|
||||
result = {
|
||||
'database': 'ok',
|
||||
'redis': 'ok' if cache_result == 'ok' else 'error',
|
||||
'website_count': website_count,
|
||||
'article_count': article_count,
|
||||
'status': 'healthy'
|
||||
}
|
||||
logger.info(f"健康检查完成,结果: {result}")
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"健康检查失败: {e}"
|
||||
logger.error(error_msg)
|
||||
return {
|
||||
'status': 'unhealthy',
|
||||
'error': str(e)
|
||||
}
|
||||
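These shared tasks are written to be scheduled as well as called ad hoc. A hedged sketch of a Celery beat configuration that would drive them; the schedule values are assumptions, not the project's actual settings:

# green_classroom/settings.py (sketch) — assumes Celery beat is running alongside the worker.
from celery.schedules import crontab

CELERY_BEAT_SCHEDULE = {
    'crawl-all-websites-nightly': {
        'task': 'core.tasks.crawl_all_websites',
        'schedule': crontab(hour=2, minute=0),
    },
    'cleanup-old-articles-weekly': {
        'task': 'core.tasks.cleanup_old_articles',
        'schedule': crontab(day_of_week='sunday', hour=3, minute=0),
        'kwargs': {'days': 30},
    },
    'health-check-every-10-minutes': {
        'task': 'core.tasks.health_check',
        'schedule': 600.0,
    },
}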
21
core/templates/admin/core/article/change_list.html
Normal file
@@ -0,0 +1,21 @@
|
||||
{% extends "admin/change_list.html" %}
|
||||
{% load admin_urls %}
|
||||
|
||||
{% block object-tools %}
|
||||
{{ block.super }}
|
||||
<!--
|
||||
<div style="margin-top: 10px;">
|
||||
<form method="post" action="{% url 'admin:run_crawler' %}" style="display: inline-block;">
|
||||
{% csrf_token %}
|
||||
<label for="website-select">选择网站:</label>
|
||||
<select name="website_name" id="website-select" required>
|
||||
<option value="">-- 请选择网站 --</option>
|
||||
{% for website in cl.model_admin.get_websites %}
|
||||
<option value="{{ website.name }}">{{ website.name }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
<input type="submit" value="执行爬虫" class="default" style="margin-left: 10px;"/>
|
||||
</form>
|
||||
</div>
|
||||
-->
|
||||
{% endblock %}
|
||||
304
core/templates/admin/crawler_status.html
Normal file
@@ -0,0 +1,304 @@
|
||||
{% extends "admin/base_site.html" %}
|
||||
{% load static %}
|
||||
|
||||
{% block title %}爬虫状态 - {{ site_title|default:_('Django site admin') }}{% endblock %}
|
||||
|
||||
{% block extrastyle %}
|
||||
<style>
|
||||
.status-card {
|
||||
background: white;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 8px;
|
||||
padding: 20px;
|
||||
margin: 20px 0;
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
.status-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 20px;
|
||||
padding-bottom: 10px;
|
||||
border-bottom: 2px solid #f0f0f0;
|
||||
}
|
||||
|
||||
.status-title {
|
||||
font-size: 24px;
|
||||
font-weight: bold;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.stats-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
||||
gap: 20px;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
|
||||
.stat-card {
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
color: white;
|
||||
padding: 20px;
|
||||
border-radius: 8px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.stat-number {
|
||||
font-size: 32px;
|
||||
font-weight: bold;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
|
||||
.stat-label {
|
||||
font-size: 14px;
|
||||
opacity: 0.9;
|
||||
}
|
||||
|
||||
.nodes-section, .batches-section {
|
||||
margin-top: 30px;
|
||||
}
|
||||
|
||||
.section-title {
|
||||
font-size: 20px;
|
||||
font-weight: bold;
|
||||
margin-bottom: 15px;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.node-item, .batch-item {
|
||||
background: #f8f9fa;
|
||||
border: 1px solid #e9ecef;
|
||||
border-radius: 6px;
|
||||
padding: 15px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.node-header, .batch-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.node-name, .batch-id {
|
||||
font-weight: bold;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.node-status, .batch-status {
|
||||
padding: 4px 8px;
|
||||
border-radius: 4px;
|
||||
font-size: 12px;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.status-active {
|
||||
background: #d4edda;
|
||||
color: #155724;
|
||||
}
|
||||
|
||||
.status-running {
|
||||
background: #fff3cd;
|
||||
color: #856404;
|
||||
}
|
||||
|
||||
.status-completed {
|
||||
background: #d1ecf1;
|
||||
color: #0c5460;
|
||||
}
|
||||
|
||||
.status-failed {
|
||||
background: #f8d7da;
|
||||
color: #721c24;
|
||||
}
|
||||
|
||||
.node-details, .batch-details {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
|
||||
gap: 10px;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.detail-item {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
}
|
||||
|
||||
.detail-label {
|
||||
color: #666;
|
||||
}
|
||||
|
||||
.detail-value {
|
||||
font-weight: bold;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.progress-bar {
|
||||
width: 100%;
|
||||
height: 8px;
|
||||
background: #e9ecef;
|
||||
border-radius: 4px;
|
||||
overflow: hidden;
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
.progress-fill {
|
||||
height: 100%;
|
||||
background: linear-gradient(90deg, #28a745, #20c997);
|
||||
transition: width 0.3s ease;
|
||||
}
|
||||
|
||||
.refresh-btn {
|
||||
background: #007bff;
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 8px 16px;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.refresh-btn:hover {
|
||||
background: #0056b3;
|
||||
}
|
||||
|
||||
.no-data {
|
||||
text-align: center;
|
||||
color: #666;
|
||||
padding: 40px;
|
||||
font-style: italic;
|
||||
}
|
||||
</style>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="status-card">
|
||||
<div class="status-header">
|
||||
<h1 class="status-title">爬虫状态监控</h1>
|
||||
<button class="refresh-btn" onclick="location.reload()">刷新</button>
|
||||
</div>
|
||||
|
||||
<!-- 统计卡片 -->
|
||||
<div class="stats-grid">
|
||||
<div class="stat-card">
|
||||
<div class="stat-number">{{ task_stats.total_nodes }}</div>
|
||||
<div class="stat-label">活跃节点</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="stat-number">{{ task_stats.active_tasks }}</div>
|
||||
<div class="stat-label">运行中任务</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="stat-number">{{ task_stats.total_batches }}</div>
|
||||
<div class="stat-label">总批次</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="stat-number">{{ nodes|length }}</div>
|
||||
<div class="stat-label">在线节点</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 节点状态 -->
|
||||
<div class="nodes-section">
|
||||
<h2 class="section-title">爬虫节点状态</h2>
|
||||
{% if nodes %}
|
||||
{% for node in nodes %}
|
||||
<div class="node-item">
|
||||
<div class="node-header">
|
||||
<span class="node-name">{{ node.node_id }}</span>
|
||||
<span class="node-status status-active">{{ node.status }}</span>
|
||||
</div>
|
||||
<div class="node-details">
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">活跃任务:</span>
|
||||
<span class="detail-value">{{ node.active_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">完成任务:</span>
|
||||
<span class="detail-value">{{ node.completed_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">失败任务:</span>
|
||||
<span class="detail-value">{{ node.failed_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">最后心跳:</span>
|
||||
<span class="detail-value">
|
||||
{% if node.last_heartbeat %}
|
||||
{{ node.last_heartbeat|date:"H:i:s" }}
|
||||
{% else %}
|
||||
未知
|
||||
{% endif %}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<div class="no-data">
|
||||
暂无活跃的爬虫节点
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
<!-- 批次状态 -->
|
||||
<div class="batches-section">
|
||||
<h2 class="section-title">最近批次</h2>
|
||||
{% if batches %}
|
||||
{% for batch in batches %}
|
||||
<div class="batch-item">
|
||||
<div class="batch-header">
|
||||
<span class="batch-id">{{ batch.batch_id }}</span>
|
||||
<span class="batch-status status-{{ batch.status }}">
|
||||
{% if batch.status == 'running' %}
|
||||
运行中
|
||||
{% elif batch.status == 'completed' %}
|
||||
已完成
|
||||
{% elif batch.status == 'failed' %}
|
||||
失败
|
||||
{% else %}
|
||||
{{ batch.status }}
|
||||
{% endif %}
|
||||
</span>
|
||||
</div>
|
||||
<div class="batch-details">
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">总任务:</span>
|
||||
<span class="detail-value">{{ batch.total_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">已完成:</span>
|
||||
<span class="detail-value">{{ batch.completed_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">失败:</span>
|
||||
<span class="detail-value">{{ batch.failed_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">进度:</span>
|
||||
<span class="detail-value">{{ batch.progress|floatformat:1 }}%</span>
|
||||
</div>
|
||||
</div>
|
||||
{% if batch.status == 'running' %}
|
||||
<div class="progress-bar">
|
||||
<div class="progress-fill" style="width: {{ batch.progress }}%"></div>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<div class="no-data">
|
||||
暂无批次记录
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// 自动刷新页面
|
||||
setTimeout(function () {
|
||||
location.reload();
|
||||
}, 30000); // 30秒刷新一次
|
||||
</script>
|
||||
{% endblock %}
|
||||
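The template above reads task_stats, nodes and batches from its context, but the admin view that renders it is not part of this diff. The following is only a sketch of a context provider consistent with the fields referenced in the template; the data source (e.g. core.distributed_crawler) and all sample values are assumptions:

from django.template.response import TemplateResponse

def crawler_status_view(request):
    # Field names mirror what admin/crawler_status.html reads.
    nodes = [
        {'node_id': 'node-1', 'status': 'active', 'active_tasks': 2,
         'completed_tasks': 15, 'failed_tasks': 1, 'last_heartbeat': None},
    ]
    batches = [
        {'batch_id': 'batch-20250925-01', 'status': 'running', 'total_tasks': 10,
         'completed_tasks': 6, 'failed_tasks': 0, 'progress': 60.0},
    ]
    context = {
        'task_stats': {'total_nodes': len(nodes), 'active_tasks': 2, 'total_batches': len(batches)},
        'nodes': nodes,
        'batches': batches,
        'title': '爬虫状态',
    }
    return TemplateResponse(request, 'admin/crawler_status.html', context)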
139
core/templates/admin/create_full_site_task.html
Normal file
@@ -0,0 +1,139 @@
|
||||
{% extends "admin/base_site.html" %}
|
||||
{% load i18n admin_urls static admin_modify %}
|
||||
|
||||
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
|
||||
|
||||
{% block breadcrumbs %}
|
||||
<div class="breadcrumbs">
|
||||
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
||||
› <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
|
||||
› {{ title }}
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<h1>{{ title }}</h1>
|
||||
|
||||
<div class="help" style="background: #fff3cd; border: 1px solid #ffeaa7; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<strong>注意:</strong>全站爬取会爬取整个网站的所有文章,可能需要很长时间。建议在非高峰时段进行。
|
||||
</div>
|
||||
|
||||
<form method="post" id="full-site-task-form">
|
||||
{% csrf_token %}
|
||||
|
||||
<fieldset class="module aligned">
|
||||
<h2>基本信息</h2>
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_name" class="required">任务名称:</label>
|
||||
<input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
|
||||
<p class="help">为这个全站爬取任务起一个容易识别的名称</p>
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="module aligned">
|
||||
<h2>目标网站</h2>
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label>选择要爬取的网站:</label>
|
||||
<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
|
||||
<label style="display: block; margin: 5px 0;">
|
||||
<input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
|
||||
<strong>全选/取消全选</strong>
|
||||
</label>
|
||||
<hr style="margin: 10px 0;">
|
||||
{% for website in websites %}
|
||||
<label style="display: block; margin: 3px 0;">
|
||||
<input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
|
||||
{{ website.name }}
|
||||
</label>
|
||||
{% endfor %}
|
||||
</div>
|
||||
<p class="help">不选择任何网站将爬取所有支持的网站</p>
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="module aligned">
|
||||
<h2>爬取设置</h2>
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_max_pages">最大爬取页数:</label>
|
||||
<input type="number" name="max_pages" id="id_max_pages" value="500" min="1" max="5000" style="width: 100px;">
|
||||
<p class="help">每个网站最多爬取的页数 (1-5000)</p>
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<div class="submit-row">
|
||||
<input type="submit" value="创建任务" class="default" name="_save">
|
||||
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<script>
|
||||
function toggleAllWebsites() {
|
||||
const selectAll = document.getElementById('select_all');
|
||||
const checkboxes = document.querySelectorAll('.website-checkbox');
|
||||
|
||||
checkboxes.forEach(checkbox => {
|
||||
checkbox.checked = selectAll.checked;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<style>
|
||||
.form-row {
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
|
||||
.form-row label {
|
||||
display: block;
|
||||
font-weight: bold;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
|
||||
.form-row input[type="text"],
|
||||
.form-row input[type="number"] {
|
||||
padding: 5px;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.form-row .help {
|
||||
color: #666;
|
||||
font-size: 12px;
|
||||
margin-top: 3px;
|
||||
}
|
||||
|
||||
.submit-row {
|
||||
margin-top: 20px;
|
||||
padding-top: 20px;
|
||||
border-top: 1px solid #ddd;
|
||||
}
|
||||
|
||||
.submit-row input[type="submit"] {
|
||||
background: #417690;
|
||||
color: white;
|
||||
padding: 10px 20px;
|
||||
border: none;
|
||||
border-radius: 3px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.submit-row .cancel-link {
|
||||
margin-left: 10px;
|
||||
padding: 10px 20px;
|
||||
background: #f8f8f8;
|
||||
color: #333;
|
||||
text-decoration: none;
|
||||
border-radius: 3px;
|
||||
border: 1px solid #ddd;
|
||||
}
|
||||
|
||||
.submit-row .cancel-link:hover {
|
||||
background: #e8e8e8;
|
||||
}
|
||||
</style>
|
||||
{% endblock %}
|
||||
164
core/templates/admin/create_historical_task.html
Normal file
164
core/templates/admin/create_historical_task.html
Normal file
@@ -0,0 +1,164 @@
|
||||
{% extends "admin/base_site.html" %}
|
||||
{% load i18n admin_urls static admin_modify %}
|
||||
|
||||
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
|
||||
|
||||
{% block breadcrumbs %}
|
||||
<div class="breadcrumbs">
|
||||
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
||||
› <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
|
||||
› {{ title }}
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<h1>{{ title }}</h1>
|
||||
|
||||
<form method="post" id="historical-task-form">
|
||||
{% csrf_token %}
|
||||
|
||||
<fieldset class="module aligned">
|
||||
<h2>基本信息</h2>
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_name" class="required">任务名称:</label>
|
||||
<input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
|
||||
<p class="help">为这个历史文章爬取任务起一个容易识别的名称</p>
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="module aligned">
|
||||
<h2>目标网站</h2>
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label>选择要爬取的网站:</label>
|
||||
<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
|
||||
<label style="display: block; margin: 5px 0;">
|
||||
<input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
|
||||
<strong>全选/取消全选</strong>
|
||||
</label>
|
||||
<hr style="margin: 10px 0;">
|
||||
{% for website in websites %}
|
||||
<label style="display: block; margin: 3px 0;">
|
||||
<input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
|
||||
{{ website.name }}
|
||||
</label>
|
||||
{% endfor %}
|
||||
</div>
|
||||
<p class="help">不选择任何网站将爬取所有支持的网站</p>
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="module aligned">
|
||||
<h2>时间范围</h2>
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_start_date" class="required">开始日期:</label>
|
||||
<input type="date" name="start_date" id="id_start_date" required>
|
||||
<p class="help">历史文章的开始日期</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_end_date" class="required">结束日期:</label>
|
||||
<input type="date" name="end_date" id="id_end_date" required>
|
||||
<p class="help">历史文章的结束日期</p>
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="module aligned">
|
||||
<h2>爬取设置</h2>
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_max_articles">每个网站最大文章数:</label>
|
||||
<input type="number" name="max_articles" id="id_max_articles" value="50" min="1" max="500" style="width: 100px;">
|
||||
<p class="help">每个网站最多爬取的文章数量 (1-500)</p>
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<div class="submit-row">
|
||||
<input type="submit" value="创建任务" class="default" name="_save">
|
||||
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<script>
|
||||
function toggleAllWebsites() {
|
||||
const selectAll = document.getElementById('select_all');
|
||||
const checkboxes = document.querySelectorAll('.website-checkbox');
|
||||
|
||||
checkboxes.forEach(checkbox => {
|
||||
checkbox.checked = selectAll.checked;
|
||||
});
|
||||
}
|
||||
|
||||
// 设置默认日期
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
const today = new Date();
|
||||
const oneMonthAgo = new Date(today.getFullYear(), today.getMonth() - 1, today.getDate());
|
||||
|
||||
document.getElementById('id_end_date').value = today.toISOString().split('T')[0];
|
||||
document.getElementById('id_start_date').value = oneMonthAgo.toISOString().split('T')[0];
|
||||
});
|
||||
</script>
|
||||
|
||||
<style>
|
||||
.form-row {
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
|
||||
.form-row label {
|
||||
display: block;
|
||||
font-weight: bold;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
|
||||
.form-row input[type="text"],
|
||||
.form-row input[type="number"],
|
||||
.form-row input[type="date"] {
|
||||
padding: 5px;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.form-row .help {
|
||||
color: #666;
|
||||
font-size: 12px;
|
||||
margin-top: 3px;
|
||||
}
|
||||
|
||||
.submit-row {
|
||||
margin-top: 20px;
|
||||
padding-top: 20px;
|
||||
border-top: 1px solid #ddd;
|
||||
}
|
||||
|
||||
.submit-row input[type="submit"] {
|
||||
background: #417690;
|
||||
color: white;
|
||||
padding: 10px 20px;
|
||||
border: none;
|
||||
border-radius: 3px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.submit-row .cancel-link {
|
||||
margin-left: 10px;
|
||||
padding: 10px 20px;
|
||||
background: #f8f8f8;
|
||||
color: #333;
|
||||
text-decoration: none;
|
||||
border-radius: 3px;
|
||||
border: 1px solid #ddd;
|
||||
}
|
||||
|
||||
.submit-row .cancel-link:hover {
|
||||
background: #e8e8e8;
|
||||
}
|
||||
</style>
|
||||
{% endblock %}
|
||||
180
core/templates/admin/create_keyword_task.html
Normal file
180
core/templates/admin/create_keyword_task.html
Normal file
@@ -0,0 +1,180 @@
|
||||
{% extends "admin/base_site.html" %}
|
||||
{% load i18n admin_urls static admin_modify %}
|
||||
|
||||
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
|
||||
|
||||
{% block breadcrumbs %}
|
||||
<div class="breadcrumbs">
|
||||
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
||||
› <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
|
||||
› {{ title }}
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<h1>{{ title }}</h1>
|
||||
|
||||
<form method="post" id="keyword-task-form">
|
||||
{% csrf_token %}
|
||||
|
||||
<fieldset class="module aligned">
|
||||
<h2>基本信息</h2>
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_name" class="required">任务名称:</label>
|
||||
<input type="text" name="name" id="id_name" required maxlength="200" style="width: 300px;">
|
||||
<p class="help">为这个爬取任务起一个容易识别的名称</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_keyword" class="required">搜索关键词:</label>
|
||||
<input type="text" name="keyword" id="id_keyword" required maxlength="200" style="width: 300px;">
|
||||
<p class="help">输入要搜索的关键词,例如:人工智能、两会、政策等</p>
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="module aligned">
|
||||
<h2>目标网站</h2>
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label>选择要爬取的网站:</label>
|
||||
<div style="max-height: 200px; overflow-y: auto; border: 1px solid #ddd; padding: 10px; margin-top: 5px;">
|
||||
<label style="display: block; margin: 5px 0;">
|
||||
<input type="checkbox" id="select_all" onchange="toggleAllWebsites()">
|
||||
<strong>全选/取消全选</strong>
|
||||
</label>
|
||||
<hr style="margin: 10px 0;">
|
||||
{% for website in websites %}
|
||||
<label style="display: block; margin: 3px 0;">
|
||||
<input type="checkbox" name="websites" value="{{ website.name }}" class="website-checkbox">
|
||||
{{ website.name }}
|
||||
</label>
|
||||
{% endfor %}
|
||||
</div>
|
||||
<p class="help">不选择任何网站将爬取所有支持的网站</p>
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="module aligned">
|
||||
<h2>时间范围 (可选)</h2>
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_start_date">开始日期:</label>
|
||||
<input type="date" name="start_date" id="id_start_date">
|
||||
<p class="help">留空则搜索所有时间</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_end_date">结束日期:</label>
|
||||
<input type="date" name="end_date" id="id_end_date">
|
||||
<p class="help">留空则搜索到当前时间</p>
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<fieldset class="module aligned">
|
||||
<h2>爬取设置</h2>
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_max_pages">最大搜索页数:</label>
|
||||
<input type="number" name="max_pages" id="id_max_pages" value="10" min="1" max="100" style="width: 100px;">
|
||||
<p class="help">每个网站最多搜索的页数 (1-100)</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="form-row">
|
||||
<div>
|
||||
<label for="id_max_articles">最大文章数量:</label>
|
||||
<input type="number" name="max_articles" id="id_max_articles" value="100" min="1" max="1000" style="width: 100px;">
|
||||
<p class="help">总共最多爬取的文章数量 (1-1000)</p>
|
||||
</div>
|
||||
</div>
|
||||
</fieldset>
|
||||
|
||||
<div class="submit-row">
|
||||
<input type="submit" value="创建任务" class="default" name="_save">
|
||||
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button cancel-link">取消</a>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<script>
|
||||
function toggleAllWebsites() {
|
||||
const selectAll = document.getElementById('select_all');
|
||||
const checkboxes = document.querySelectorAll('.website-checkbox');
|
||||
|
||||
checkboxes.forEach(checkbox => {
|
||||
checkbox.checked = selectAll.checked;
|
||||
});
|
||||
}
|
||||
|
||||
// 设置默认日期
|
||||
document.addEventListener('DOMContentLoaded', function() {
|
||||
const today = new Date();
|
||||
const oneMonthAgo = new Date(today.getFullYear(), today.getMonth() - 1, today.getDate());
|
||||
|
||||
document.getElementById('id_end_date').value = today.toISOString().split('T')[0];
|
||||
document.getElementById('id_start_date').value = oneMonthAgo.toISOString().split('T')[0];
|
||||
});
|
||||
</script>
|
||||
|
||||
<style>
|
||||
.form-row {
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
|
||||
.form-row label {
|
||||
display: block;
|
||||
font-weight: bold;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
|
||||
.form-row input[type="text"],
|
||||
.form-row input[type="number"],
|
||||
.form-row input[type="date"] {
|
||||
padding: 5px;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.form-row .help {
|
||||
color: #666;
|
||||
font-size: 12px;
|
||||
margin-top: 3px;
|
||||
}
|
||||
|
||||
.submit-row {
|
||||
margin-top: 20px;
|
||||
padding-top: 20px;
|
||||
border-top: 1px solid #ddd;
|
||||
}
|
||||
|
||||
.submit-row input[type="submit"] {
|
||||
background: #417690;
|
||||
color: white;
|
||||
padding: 10px 20px;
|
||||
border: none;
|
||||
border-radius: 3px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.submit-row .cancel-link {
|
||||
margin-left: 10px;
|
||||
padding: 10px 20px;
|
||||
background: #f8f8f8;
|
||||
color: #333;
|
||||
text-decoration: none;
|
||||
border-radius: 3px;
|
||||
border: 1px solid #ddd;
|
||||
}
|
||||
|
||||
.submit-row .cancel-link:hover {
|
||||
background: #e8e8e8;
|
||||
}
|
||||
</style>
|
||||
{% endblock %}
|
||||
172
core/templates/admin/index.html
Normal file
172
core/templates/admin/index.html
Normal file
@@ -0,0 +1,172 @@
|
||||
{% extends "admin/base_site.html" %}
|
||||
{% load i18n static %}
|
||||
|
||||
{% block extrastyle %}{{ block.super }}<link rel="stylesheet" type="text/css" href="{% static "admin/css/dashboard.css" %}">{% endblock %}
|
||||
|
||||
{% block coltype %}colMS{% endblock %}
|
||||
|
||||
{% block bodyclass %}{{ block.super }} dashboard{% endblock %}
|
||||
|
||||
{% block breadcrumbs %}{% endblock %}
|
||||
|
||||
{% block nav-sidebar %}{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div id="content-main">
|
||||
|
||||
{% if app_list %}
|
||||
{% for app in app_list %}
|
||||
<div class="app-{{ app.app_label }} module">
|
||||
<table>
|
||||
<caption>
|
||||
<a href="{{ app.app_url }}" class="section" title="{% blocktranslate with name=app.name %}Models in the {{ name }} application{% endblocktranslate %}">{{ app.name }}</a>
|
||||
</caption>
|
||||
{% for model in app.models %}
|
||||
<tr class="model-{{ model.object_name|lower }}">
|
||||
{% if model.admin_url %}
|
||||
<th scope="row"><a href="{{ model.admin_url }}"{% if model.add_url %} class="addlink"{% endif %}>{{ model.name }}</a></th>
|
||||
{% else %}
|
||||
<th scope="row">{{ model.name }}</th>
|
||||
{% endif %}
|
||||
|
||||
{% if model.add_url %}
|
||||
<td><a href="{{ model.add_url }}" class="addlink">{% translate 'Add' %}</a></td>
|
||||
{% else %}
|
||||
<td> </td>
|
||||
{% endif %}
|
||||
|
||||
{% if model.admin_url %}
|
||||
{% if model.view_only %}
|
||||
<td><a href="{{ model.admin_url }}" class="viewlink">{% translate 'View' %}</a></td>
|
||||
{% else %}
|
||||
<td><a href="{{ model.admin_url }}" class="changelink">{% translate 'Change' %}</a></td>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<td> </td>
|
||||
{% endif %}
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<p>{% translate "You don't have permission to view or edit anything." %}</p>
|
||||
{% endif %}
|
||||
|
||||
<!-- 自定义快速操作区域 -->
|
||||
<div class="module" style="margin-top: 20px;">
|
||||
<h2>快速创建爬取任务</h2>
|
||||
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 15px; margin-top: 15px;">
|
||||
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
|
||||
<h3 style="margin-top: 0; color: #417690;">关键词搜索</h3>
|
||||
<p style="color: #666; font-size: 14px;">根据关键词搜索并爬取相关文章</p>
|
||||
<a href="{% url 'admin:create_keyword_task' %}" class="button" style="background: #417690; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
|
||||
创建任务
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
|
||||
<h3 style="margin-top: 0; color: #28a745;">历史文章</h3>
|
||||
<p style="color: #666; font-size: 14px;">爬取指定日期范围的历史文章</p>
|
||||
<a href="{% url 'admin:create_historical_task' %}" class="button" style="background: #28a745; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
|
||||
创建任务
|
||||
</a>
|
||||
</div>
|
||||
|
||||
<div style="border: 1px solid #ddd; padding: 15px; border-radius: 5px; text-align: center;">
|
||||
<h3 style="margin-top: 0; color: #dc3545;">全站爬取</h3>
|
||||
<p style="color: #666; font-size: 14px;">爬取整个网站的所有文章</p>
|
||||
<a href="{% url 'admin:create_full_site_task' %}" class="button" style="background: #dc3545; color: white; padding: 8px 16px; text-decoration: none; border-radius: 3px; display: inline-block;">
|
||||
创建任务
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 最近任务状态 -->
|
||||
<div class="module" style="margin-top: 20px;">
|
||||
<h2>最近任务状态</h2>
|
||||
<div style="margin-top: 15px;">
|
||||
{% load core_extras %}
|
||||
{% get_recent_tasks as recent_tasks %}
|
||||
{% if recent_tasks %}
|
||||
<table style="width: 100%;">
|
||||
<thead>
|
||||
<tr style="background: #f8f9fa;">
|
||||
<th style="padding: 8px; text-align: left;">任务名称</th>
|
||||
<th style="padding: 8px; text-align: left;">类型</th>
|
||||
<th style="padding: 8px; text-align: left;">状态</th>
|
||||
<th style="padding: 8px; text-align: left;">进度</th>
|
||||
<th style="padding: 8px; text-align: left;">创建时间</th>
|
||||
<th style="padding: 8px; text-align: left;">操作</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for task in recent_tasks %}
|
||||
<tr>
|
||||
<td style="padding: 8px;">{{ task.name }}</td>
|
||||
<td style="padding: 8px;">{{ task.get_task_type_display }}</td>
|
||||
<td style="padding: 8px;">
|
||||
<span style="color: {% if task.status == 'completed' %}green{% elif task.status == 'failed' %}red{% elif task.status == 'running' %}blue{% else %}gray{% endif %};">
|
||||
{{ task.get_status_display }}
|
||||
</span>
|
||||
</td>
|
||||
<td style="padding: 8px;">
|
||||
{% if task.status == 'running' %}
|
||||
<div style="width: 100px; background-color: #f0f0f0; border-radius: 3px; overflow: hidden;">
|
||||
<div style="width: {{ task.progress }}%; background-color: #4CAF50; height: 16px; text-align: center; line-height: 16px; color: white; font-size: 12px;">
|
||||
{{ task.progress }}%
|
||||
</div>
|
||||
</div>
|
||||
{% else %}
|
||||
-
|
||||
{% endif %}
|
||||
</td>
|
||||
<td style="padding: 8px;">{{ task.created_at|date:"m-d H:i" }}</td>
|
||||
<td style="padding: 8px;">
|
||||
<a href="{% url 'admin:core_crawltask_change' task.id %}" style="color: #417690; text-decoration: none;">查看</a>
|
||||
</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% else %}
|
||||
<p style="color: #666; text-align: center; padding: 20px;">暂无任务</p>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block sidebar %}
|
||||
<div id="content-related">
|
||||
<div class="module" id="recent-actions-module">
|
||||
<h2>{% translate 'Recent actions' %}</h2>
|
||||
<h3>{% translate 'My actions' %}</h3>
|
||||
{% load log %}
|
||||
{% get_admin_log 10 as admin_log for_user user %}
|
||||
{% if not admin_log %}
|
||||
<p>{% translate 'None available' %}</p>
|
||||
{% else %}
|
||||
<ul class="actionlist">
|
||||
{% for entry in admin_log %}
|
||||
<li class="{% if entry.is_addition %}addlink{% endif %}{% if entry.is_change %}changelink{% endif %}{% if entry.is_deletion %}deletelink{% endif %}">
|
||||
{% if entry.is_deletion or not entry.get_admin_url %}
|
||||
{{ entry.object_repr }}
|
||||
{% else %}
|
||||
<a href="{{ entry.get_admin_url }}">{{ entry.object_repr }}</a>
|
||||
{% endif %}
|
||||
<br>
|
||||
{% if entry.content_type %}
|
||||
<span class="mini quiet">{% filter capfirst %}{{ entry.content_type.name }}{% endfilter %}</span>
|
||||
{% else %}
|
||||
<span class="mini quiet">{% translate 'Unknown content' %}</span>
|
||||
{% endif %}
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
||||
184
core/templates/admin/task_results.html
Normal file
184
core/templates/admin/task_results.html
Normal file
@@ -0,0 +1,184 @@
|
||||
{% extends "admin/base_site.html" %}
|
||||
{% load i18n admin_urls static admin_modify %}
|
||||
|
||||
{% block title %}{{ title }} | {{ site_title|default:_('Django site admin') }}{% endblock %}
|
||||
|
||||
{% block breadcrumbs %}
|
||||
<div class="breadcrumbs">
|
||||
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
||||
› <a href="{% url 'admin:core_crawltask_changelist' %}">爬取任务</a>
|
||||
› {{ title }}
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<h1>{{ title }}</h1>
|
||||
|
||||
<div class="results-summary" style="background: #f8f9fa; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2>任务概览</h2>
|
||||
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
|
||||
<div>
|
||||
<strong>任务名称:</strong><br>
|
||||
{{ task.name }}
|
||||
</div>
|
||||
<div>
|
||||
<strong>任务类型:</strong><br>
|
||||
{{ task.get_task_type_display }}
|
||||
</div>
|
||||
<div>
|
||||
<strong>状态:</strong><br>
|
||||
<span style="color: {% if task.status == 'completed' %}green{% elif task.status == 'failed' %}red{% elif task.status == 'running' %}blue{% else %}gray{% endif %};">
|
||||
{{ task.get_status_display }}
|
||||
</span>
|
||||
</div>
|
||||
<div>
|
||||
<strong>创建时间:</strong><br>
|
||||
{{ task.created_at|date:"Y-m-d H:i:s" }}
|
||||
</div>
|
||||
{% if task.started_at %}
|
||||
<div>
|
||||
<strong>开始时间:</strong><br>
|
||||
{{ task.started_at|date:"Y-m-d H:i:s" }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if task.completed_at %}
|
||||
<div>
|
||||
<strong>完成时间:</strong><br>
|
||||
{{ task.completed_at|date:"Y-m-d H:i:s" }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if task.get_duration %}
|
||||
<div>
|
||||
<strong>执行时长:</strong><br>
|
||||
{{ task.duration_display }}
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="results-stats" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2>统计信息</h2>
|
||||
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 15px;">
|
||||
<div style="text-align: center; padding: 15px; background: #e3f2fd; border-radius: 5px;">
|
||||
<div style="font-size: 24px; font-weight: bold; color: #1976d2;">{{ task.total_articles }}</div>
|
||||
<div>总文章数</div>
|
||||
</div>
|
||||
<div style="text-align: center; padding: 15px; background: #e8f5e8; border-radius: 5px;">
|
||||
<div style="font-size: 24px; font-weight: bold; color: #388e3c;">{{ task.success_count }}</div>
|
||||
<div>成功数</div>
|
||||
</div>
|
||||
<div style="text-align: center; padding: 15px; background: #ffebee; border-radius: 5px;">
|
||||
<div style="font-size: 24px; font-weight: bold; color: #d32f2f;">{{ task.failed_count }}</div>
|
||||
<div>失败数</div>
|
||||
</div>
|
||||
{% if task.total_articles > 0 %}
|
||||
<div style="text-align: center; padding: 15px; background: #fff3e0; border-radius: 5px;">
|
||||
<div style="font-size: 24px; font-weight: bold; color: #f57c00;">
|
||||
{% widthratio task.success_count task.total_articles 100 %}%
|
||||
</div>
|
||||
<div>成功率</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% if task.keyword %}
|
||||
<div class="task-config" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2>任务配置</h2>
|
||||
<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px;">
|
||||
<div>
|
||||
<strong>搜索关键词:</strong><br>
|
||||
{{ task.keyword }}
|
||||
</div>
|
||||
<div>
|
||||
<strong>目标网站:</strong><br>
|
||||
{{ task.get_websites_display }}
|
||||
</div>
|
||||
{% if task.start_date %}
|
||||
<div>
|
||||
<strong>开始日期:</strong><br>
|
||||
{{ task.start_date }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if task.end_date %}
|
||||
<div>
|
||||
<strong>结束日期:</strong><br>
|
||||
{{ task.end_date }}
|
||||
</div>
|
||||
{% endif %}
|
||||
<div>
|
||||
<strong>最大页数:</strong><br>
|
||||
{{ task.max_pages }}
|
||||
</div>
|
||||
<div>
|
||||
<strong>最大文章数:</strong><br>
|
||||
{{ task.max_articles }}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if task.current_website or task.current_action %}
|
||||
<div class="current-status" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2>当前状态</h2>
|
||||
{% if task.current_website %}
|
||||
<div>
|
||||
<strong>当前网站:</strong> {{ task.current_website }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if task.current_action %}
|
||||
<div>
|
||||
<strong>当前操作:</strong> {{ task.current_action }}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if task.status == 'running' %}
|
||||
<div style="margin-top: 10px;">
|
||||
<div style="width: 100%; background-color: #f0f0f0; border-radius: 10px; overflow: hidden;">
|
||||
<div style="width: {{ task.progress }}%; background-color: #4CAF50; height: 20px; text-align: center; line-height: 20px; color: white;">
|
||||
{{ task.progress }}%
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if task.error_message %}
|
||||
<div class="error-info" style="background: #ffebee; border: 1px solid #f44336; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2 style="color: #d32f2f;">错误信息</h2>
|
||||
<pre style="white-space: pre-wrap; word-wrap: break-word;">{{ task.error_message }}</pre>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% if task.result_details %}
|
||||
<div class="detailed-results" style="background: #fff; border: 1px solid #dee2e6; padding: 20px; margin-bottom: 20px; border-radius: 5px;">
|
||||
<h2>详细结果</h2>
|
||||
{% for website, result in task.result_details.items %}
|
||||
<div style="margin-bottom: 15px; padding: 10px; background: #f8f9fa; border-radius: 3px;">
|
||||
<strong>{{ website }}:</strong>
|
||||
<ul style="margin: 5px 0; padding-left: 20px;">
|
||||
<li>找到链接: {{ result.found_urls }}</li>
|
||||
<li>已处理: {{ result.processed }}</li>
|
||||
<li>成功: {{ result.success }}</li>
|
||||
<li>失败: {{ result.failed }}</li>
|
||||
{% if result.error %}
|
||||
<li style="color: red;">错误: {{ result.error }}</li>
|
||||
{% endif %}
|
||||
</ul>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div class="actions" style="text-align: center; margin-top: 30px;">
|
||||
<a href="{% url 'admin:core_crawltask_changelist' %}" class="button" style="padding: 10px 20px; background: #417690; color: white; text-decoration: none; border-radius: 3px; margin-right: 10px;">
|
||||
返回任务列表
|
||||
</a>
|
||||
{% if task.status == 'completed' %}
|
||||
<a href="{% url 'admin:core_article_changelist' %}" class="button" style="padding: 10px 20px; background: #28a745; color: white; text-decoration: none; border-radius: 3px;">
|
||||
查看文章
|
||||
</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
{% endblock %}
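The success-rate figure in the statistics panel above uses Django's {% widthratio %} tag, which computes value / max_value * max_width and rounds the result to an integer. A quick worked example in plain Python (the numbers are hypothetical):

    # {% widthratio task.success_count task.total_articles 100 %}
    # with success_count=45 and total_articles=60 renders the same number as:
    round(45 / 60 * 100)  # -> 75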
|
||||
@@ -2,24 +2,25 @@
|
||||
<html lang="zh">
|
||||
<head>
|
||||
<meta charset="UTF-8"/>
|
||||
<title>{{ article.title }}</title>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
|
||||
<title>{{ article.title }} - 绿色课堂</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
color: #333;
|
||||
max-width: 1200px; /* 修改:同步调整页面最大宽度与列表页一致 */
|
||||
margin: 0 auto;
|
||||
padding: 20px;
|
||||
background-color: #f8f9fa;
|
||||
background-color: #f0f8ff;
|
||||
max-width: 800px;
|
||||
}
|
||||
|
||||
.article-container {
|
||||
.container {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
|
||||
padding: 30px;
|
||||
margin-bottom: 20px;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
|
||||
border-radius: 8px;
|
||||
}
|
||||
|
||||
h1 {
|
||||
@@ -30,56 +31,68 @@
|
||||
}
|
||||
|
||||
.meta {
|
||||
color: #7f8c8d;
|
||||
color: #78909c;
|
||||
font-size: 0.9em;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
hr {
|
||||
border: 0;
|
||||
height: 1px;
|
||||
background: #ecf0f1;
|
||||
margin: 20px 0;
|
||||
}
|
||||
|
||||
.content {
|
||||
font-size: 16px;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
.content img {
|
||||
/* 优化:确保图片和视频不会超出容器显示 */
|
||||
.content img, .content video {
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
border-radius: 4px;
|
||||
display: block;
|
||||
margin: 10px 0;
|
||||
}
|
||||
|
||||
/* 优化:确保iframe也不会超出容器显示 */
|
||||
.content iframe {
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
}
|
||||
|
||||
.back-link {
|
||||
display: inline-block;
|
||||
padding: 10px 20px;
|
||||
background-color: #3498db;
|
||||
color: white;
|
||||
margin-bottom: 20px;
|
||||
color: #1976d2;
|
||||
text-decoration: none;
|
||||
border-radius: 4px;
|
||||
transition: background-color 0.3s;
|
||||
}
|
||||
|
||||
.back-link:hover {
|
||||
background-color: #2980b9;
|
||||
color: #0d47a1;
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
@media (max-width: 600px) {
|
||||
body {
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
.container {
|
||||
padding: 15px;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="article-container">
|
||||
<div class="container">
|
||||
<a href="{% url 'article_list' %}" class="back-link">« 返回文章列表</a>
|
||||
|
||||
<h1>{{ article.title }}</h1>
|
||||
|
||||
<div class="meta">
|
||||
<p>发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}</p>
|
||||
网站: {{ article.website.name }} |
|
||||
发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
|
||||
创建时间: {{ article.created_at|date:"Y-m-d H:i" }} |
|
||||
源网址: <a href="{{ article.url }}" target="_blank">{{ article.url }}</a>
|
||||
</div>
|
||||
<hr/>
|
||||
|
||||
<div class="content">
|
||||
{{ article.content|safe }}
|
||||
</div>
|
||||
<hr/>
|
||||
<p><a href="{% url 'article_list' %}" class="back-link">← 返回列表</a></p>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -8,18 +8,17 @@
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
color: #333;
|
||||
max-width: 1200px; /* 修改:增加页面最大宽度 */
|
||||
margin: 0 auto;
|
||||
padding: 20px;
|
||||
background-color: #f8f9fa;
|
||||
background-color: #f0f8ff; /* 统一背景色调 */
|
||||
}
|
||||
|
||||
.container {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
|
||||
padding: 30px;
|
||||
margin-bottom: 20px;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* 添加轻微阴影 */
|
||||
border-radius: 8px; /* 添加圆角 */
|
||||
}
|
||||
|
||||
h1 {
|
||||
@@ -32,7 +31,7 @@
|
||||
.filters {
|
||||
margin-bottom: 20px;
|
||||
padding: 15px;
|
||||
background-color: #f1f8ff;
|
||||
background-color: #e3f2fd; /* 统一滤镜背景色调 */
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
@@ -40,8 +39,8 @@
|
||||
display: inline-block;
|
||||
padding: 5px 10px;
|
||||
margin: 0 5px 5px 0;
|
||||
background-color: #e1e8ed;
|
||||
color: #333;
|
||||
background-color: #bbdefb; /* 统一链接背景色调 */
|
||||
color: #0d47a1;
|
||||
text-decoration: none;
|
||||
border-radius: 3px;
|
||||
}
|
||||
@@ -58,7 +57,7 @@
|
||||
|
||||
li {
|
||||
padding: 10px 0;
|
||||
border-bottom: 1px solid #ecf0f1;
|
||||
border-bottom: 1px solid #e0e0e0; /* 统一分隔线颜色 */
|
||||
}
|
||||
|
||||
li:last-child {
|
||||
@@ -66,17 +65,17 @@
|
||||
}
|
||||
|
||||
a {
|
||||
color: #3498db;
|
||||
color: #1976d2; /* 统一链接颜色 */
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
color: #2980b9;
|
||||
color: #0d47a1; /* 统一悬停颜色 */
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.meta {
|
||||
color: #7f8c8d;
|
||||
color: #78909c; /* 统一元数据颜色 */
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
@@ -117,23 +116,24 @@
|
||||
padding: 8px 4px;
|
||||
color: #7f8c8d;
|
||||
}
|
||||
|
||||
|
||||
/* 新增:搜索框样式 */
|
||||
.search-form {
|
||||
margin-bottom: 20px;
|
||||
padding: 15px;
|
||||
background-color: #f1f8ff;
|
||||
background-color: #e3f2fd; /* 统一搜索框背景色调 */
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
|
||||
.search-form input[type="text"] {
|
||||
padding: 8px 12px;
|
||||
border: 1px solid #ddd;
|
||||
border: 1px solid #bbdefb; /* 统一边框颜色 */
|
||||
border-radius: 4px;
|
||||
width: 300px;
|
||||
margin-right: 10px;
|
||||
background-color: #fff;
|
||||
}
|
||||
|
||||
|
||||
.search-form input[type="submit"] {
|
||||
padding: 8px 16px;
|
||||
background-color: #3498db;
|
||||
@@ -142,111 +142,438 @@
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
|
||||
.search-form input[type="submit"]:hover {
|
||||
background-color: #2980b9;
|
||||
}
|
||||
|
||||
|
||||
.search-info {
|
||||
color: #7f8c8d;
|
||||
color: #78909c; /* 统一搜索信息颜色 */
|
||||
font-size: 0.9em;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
/* 新增:左侧筛选栏样式 */
|
||||
.content-wrapper {
|
||||
display: flex;
|
||||
gap: 20px;
|
||||
}
|
||||
|
||||
.sidebar {
|
||||
flex: 0 0 200px;
|
||||
background-color: #e3f2fd; /* 统一边栏背景色调 */
|
||||
border-radius: 5px;
|
||||
padding: 15px;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.sidebar .filters {
|
||||
margin-bottom: 20px;
|
||||
padding: 0;
|
||||
background-color: transparent;
|
||||
}
|
||||
|
||||
.sidebar .filters strong {
|
||||
display: block;
|
||||
margin-bottom: 10px;
|
||||
color: #2c3e50;
|
||||
}
|
||||
|
||||
.sidebar .filters a {
|
||||
display: block;
|
||||
padding: 8px 10px;
|
||||
margin: 0 0 5px 0;
|
||||
background-color: #bbdefb; /* 统一边栏链接背景色调 */
|
||||
color: #0d47a1;
|
||||
text-decoration: none;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.sidebar .filters a.active {
|
||||
background-color: #3498db;
|
||||
color: white;
|
||||
}
|
||||
|
||||
/* 新增:导出功能样式 */
|
||||
.export-section {
|
||||
margin-bottom: 20px;
|
||||
padding: 15px;
|
||||
background-color: #e8f5e9; /* 统一导出区域背景色调 */
|
||||
border-radius: 5px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.export-btn {
|
||||
padding: 10px 20px;
|
||||
background-color: #4caf50; /* 统一按钮背景色调 */
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
font-size: 16px;
|
||||
margin: 0 5px;
|
||||
}
|
||||
|
||||
.export-btn:hover {
|
||||
background-color: #388e3c; /* 统一按钮悬停色调 */
|
||||
}
|
||||
|
||||
.export-btn:disabled {
|
||||
background-color: #9e9e9e; /* 统一禁用按钮色调 */
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.article-checkbox {
|
||||
margin-right: 10px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1>绿色课堂文章列表</h1>
|
||||
|
||||
<!-- 新增:返回首页链接 -->
|
||||
<div style="margin-bottom: 20px;">
|
||||
<a href="{% url 'article_list' %}" style="color: #3498db; text-decoration: none;">← 返回首页</a>
|
||||
</div>
|
||||
|
||||
<!-- 新增:搜索表单 -->
|
||||
<div class="search-form">
|
||||
<form method="get">
|
||||
<input type="text" name="q" placeholder="输入关键词搜索文章..." value="{{ search_query }}">
|
||||
{% if selected_website %}
|
||||
<input type="hidden" name="website" value="{{ selected_website.id }}">
|
||||
<input type="hidden" name="website" value="{{ selected_website.id }}">
|
||||
{% endif %}
|
||||
<input type="submit" value="搜索">
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<div class="filters">
|
||||
<strong>按网站筛选:</strong>
|
||||
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" {% if not selected_website %}class="active" {% endif %}>全部</a>
|
||||
{% for website in websites %}
|
||||
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" {% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
|
||||
{% endfor %}
|
||||
</div>
|
||||
<div class="content-wrapper">
|
||||
<!-- 左侧筛选栏 -->
|
||||
<div class="sidebar">
|
||||
<div class="filters">
|
||||
<strong>按网站筛选:</strong>
|
||||
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}"
|
||||
{% if not selected_website %}class="active" {% endif %}>全部</a>
|
||||
{% for website in websites %}
|
||||
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}"
|
||||
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
|
||||
{% endfor %}
|
||||
</div>
|
||||
<!-- 修改:按媒体类型筛选 -->
|
||||
<div class="filters">
|
||||
<strong>按媒体类型筛选:</strong>
|
||||
<a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query }}&{% endif %}media_type=all"
|
||||
{% if not request.GET.media_type or request.GET.media_type == 'all' %}class="active"{% endif %}>全部</a>
|
||||
<a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query }}&{% endif %}media_type=text_only"
|
||||
{% if request.GET.media_type == 'text_only' %}class="active"{% endif %}>纯文本</a>
|
||||
<a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query }}&{% endif %}media_type=with_images"
|
||||
{% if request.GET.media_type == 'with_images' %}class="active"{% endif %}>图片</a>
|
||||
<a href="?{% if selected_website %}website={{ selected_website.id }}&{% endif %}{% if search_query %}q={{ search_query }}&{% endif %}media_type=with_videos"
|
||||
{% if request.GET.media_type == 'with_videos' %}class="active"{% endif %}>视频</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 新增:搜索结果信息 -->
|
||||
{% if search_query %}
|
||||
<div class="search-info">
|
||||
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
|
||||
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
|
||||
</div>
|
||||
{% endif %}
|
||||
<!-- 主内容区域 -->
|
||||
<div class="main-content">
|
||||
<!-- 新增:搜索结果信息 -->
|
||||
{% if search_query %}
|
||||
<div class="search-info">
|
||||
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
|
||||
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<ul>
|
||||
{% for article in page_obj %}
|
||||
<li>
|
||||
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
|
||||
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
|
||||
</li>
|
||||
{% empty %}
|
||||
<li>暂无文章</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
<!-- 新增:导出功能 -->
|
||||
<div class="export-section">
|
||||
<button id="selectAllBtn" class="export-btn">全选</button>
|
||||
<button id="deselectAllBtn" class="export-btn">取消全选</button>
|
||||
<button id="exportJsonBtn" class="export-btn" disabled>导出为JSON</button>
|
||||
<button id="exportCsvBtn" class="export-btn" disabled>导出为CSV</button>
|
||||
<!-- 新增:导出为ZIP包按钮 -->
|
||||
<button id="exportZipBtn" class="export-btn" disabled>导出为ZIP包</button>
|
||||
<!-- 删除:按类型导出按钮 -->
|
||||
<!-- <button id="exportTextOnlyBtn" class="export-btn">导出纯文本</button>
|
||||
<button id="exportWithImagesBtn" class="export-btn">导出含图片</button>
|
||||
<button id="exportWithVideosBtn" class="export-btn">导出含视频</button> -->
|
||||
</div>
|
||||
|
||||
<div class="pagination">
|
||||
{% if page_obj.has_previous %}
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">« 首页</a>
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">« 首页</a>
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
<ul>
|
||||
{% for article in page_obj %}
|
||||
<li>
|
||||
<input type="checkbox" class="article-checkbox" value="{{ article.id }}"
|
||||
id="article_{{ article.id }}">
|
||||
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
|
||||
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
|
||||
</li>
|
||||
{% empty %}
|
||||
<li>暂无文章</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
|
||||
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
|
||||
<div class="pagination">
|
||||
{% if page_obj.has_previous %}
|
||||
{% if selected_website %}
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">«
|
||||
首页</a>
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">« 首页</a>
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
<!-- 修改:优化页码显示逻辑 -->
|
||||
{% with page_obj.paginator as paginator %}
|
||||
{% for num in paginator.page_range %}
|
||||
{% if page_obj.number == num %}
|
||||
<a href="#" class="current">{{ num }}</a>
|
||||
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||
{% endif %}
|
||||
{% elif num == 1 or num == paginator.num_pages %}
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||
{% endif %}
|
||||
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
|
||||
<span class="ellipsis">...</span>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endwith %}
|
||||
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
|
||||
|
||||
{% if page_obj.has_next %}
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页 »</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页 »</a>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
<!-- 修改:优化页码显示逻辑 -->
|
||||
{% with page_obj.paginator as paginator %}
|
||||
{% for num in paginator.page_range %}
|
||||
{% if page_obj.number == num %}
|
||||
<a href="#" class="current">{{ num }}</a>
|
||||
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
|
||||
{% if selected_website %}
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||
{% endif %}
|
||||
{% elif num == 1 or num == paginator.num_pages %}
|
||||
{% if selected_website %}
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||
{% endif %}
|
||||
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
|
||||
<span class="ellipsis">...</span>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endwith %}
|
||||
|
||||
{% if page_obj.has_next %}
|
||||
{% if selected_website %}
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页
|
||||
»</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页
|
||||
»</a>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// 导出功能相关JavaScript
|
||||
const checkboxes = document.querySelectorAll('.article-checkbox');
|
||||
const exportJsonBtn = document.getElementById('exportJsonBtn');
|
||||
const exportCsvBtn = document.getElementById('exportCsvBtn');
|
||||
const selectAllBtn = document.getElementById('selectAllBtn');
|
||||
const deselectAllBtn = document.getElementById('deselectAllBtn');
|
||||
// 新增:获取ZIP导出按钮元素
|
||||
const exportZipBtn = document.getElementById('exportZipBtn');
|
||||
|
||||
// const exportTextOnlyBtn = document.getElementById('exportTextOnlyBtn');
|
||||
// const exportWithImagesBtn = document.getElementById('exportWithImagesBtn');
|
||||
// const exportWithVideosBtn = document.getElementById('exportWithVideosBtn');
|
||||
|
||||
// 更新导出按钮状态
|
||||
function updateExportButtons() {
|
||||
const selectedCount = document.querySelectorAll('.article-checkbox:checked').length;
|
||||
exportJsonBtn.disabled = selectedCount === 0;
|
||||
exportCsvBtn.disabled = selectedCount === 0;
|
||||
exportZipBtn.disabled = selectedCount === 0; // 新增:更新ZIP导出按钮状态
|
||||
}
|
||||
|
||||
// 为所有复选框添加事件监听器
|
||||
checkboxes.forEach(checkbox => {
|
||||
checkbox.addEventListener('change', updateExportButtons);
|
||||
});
|
||||
|
||||
// 全选功能
|
||||
selectAllBtn.addEventListener('click', () => {
|
||||
checkboxes.forEach(checkbox => {
|
||||
checkbox.checked = true;
|
||||
});
|
||||
updateExportButtons();
|
||||
});
|
||||
|
||||
// 取消全选功能
|
||||
deselectAllBtn.addEventListener('click', () => {
|
||||
checkboxes.forEach(checkbox => {
|
||||
checkbox.checked = false;
|
||||
});
|
||||
updateExportButtons();
|
||||
});
|
||||
|
||||
// 导出为JSON功能
|
||||
exportJsonBtn.addEventListener('click', () => {
|
||||
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
|
||||
.map(checkbox => checkbox.value);
|
||||
|
||||
// 发送POST请求导出文章
|
||||
fetch('{% url "export_articles" %}', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'X-CSRFToken': '{{ csrf_token }}'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
article_ids: selectedArticles,
|
||||
format: 'json'
|
||||
})
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.json';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
});
|
||||
|
||||
// 导出为CSV功能
|
||||
exportCsvBtn.addEventListener('click', () => {
|
||||
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
|
||||
.map(checkbox => checkbox.value);
|
||||
|
||||
// 发送POST请求导出文章
|
||||
fetch('{% url "export_articles" %}', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'X-CSRFToken': '{{ csrf_token }}'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
article_ids: selectedArticles,
|
||||
format: 'csv'
|
||||
})
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.csv';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
});
|
||||
|
||||
// 新增:导出为ZIP包功能
|
||||
exportZipBtn.addEventListener('click', () => {
|
||||
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
|
||||
.map(checkbox => checkbox.value);
|
||||
|
||||
// 发送POST请求导出文章为ZIP包
|
||||
fetch('{% url "export_articles" %}', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'X-CSRFToken': '{{ csrf_token }}'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
article_ids: selectedArticles,
|
||||
format: 'zip' // 指定导出格式为ZIP
|
||||
})
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.zip';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
});
|
||||
|
||||
// exportTextOnlyBtn.addEventListener('click', () => {
|
||||
// exportByMediaType('text_only');
|
||||
// });
|
||||
|
||||
// exportWithImagesBtn.addEventListener('click', () => {
|
||||
// exportByMediaType('with_images');
|
||||
// });
|
||||
|
||||
// exportWithVideosBtn.addEventListener('click', () => {
|
||||
// exportByMediaType('with_videos');
|
||||
// });
|
||||
|
||||
// function exportByMediaType(mediaType) {
|
||||
// // 发送POST请求按类型导出文章
|
||||
// fetch('{% url "export_articles_by_type" %}', {
|
||||
// method: 'POST',
|
||||
// headers: {
|
||||
// 'Content-Type': 'application/json',
|
||||
// 'X-CSRFToken': '{{ csrf_token }}'
|
||||
// },
|
||||
// body: JSON.stringify({
|
||||
// media_type: mediaType,
|
||||
// format: 'zip'
|
||||
// })
|
||||
// })
|
||||
// .then(response => {
|
||||
// if (response.ok) {
|
||||
// return response.blob();
|
||||
// }
|
||||
// throw new Error('导出失败');
|
||||
// })
|
||||
// .then(blob => {
|
||||
// const url = window.URL.createObjectURL(blob);
|
||||
// const a = document.createElement('a');
|
||||
// a.href = url;
|
||||
// a.download = `articles_${mediaType}.zip`;
|
||||
// document.body.appendChild(a);
|
||||
// a.click();
|
||||
// window.URL.revokeObjectURL(url);
|
||||
// document.body.removeChild(a);
|
||||
// })
|
||||
// .catch(error => {
|
||||
// alert('导出失败: ' + error);
|
||||
// });
|
||||
// }
|
||||
|
||||
// 初始化导出按钮状态
|
||||
updateExportButtons();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
</html>
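The export buttons in the list template above all follow the same request contract: a POST to the export_articles URL with a JSON body containing article_ids and a format field ('json', 'csv', or 'zip'), answered with a downloadable file. The view itself is not part of this diff, so the following is only a minimal sketch of a handler matching that contract; everything beyond the export_articles URL name and the Article fields shown in core/tests.py (title, url, content) is an assumption.

    import json

    from django.http import HttpResponse, JsonResponse
    from django.views.decorators.http import require_POST

    from core.models import Article


    @require_POST
    def export_articles(request):
        # Body sent by the export buttons: {"article_ids": [...], "format": "json" | "csv" | "zip"}
        payload = json.loads(request.body)
        articles = Article.objects.filter(id__in=payload.get("article_ids", []))
        export_format = payload.get("format", "json")

        if export_format == "json":
            data = [{"title": a.title, "url": a.url, "content": a.content} for a in articles]
            response = JsonResponse(data, safe=False)
            response["Content-Disposition"] = 'attachment; filename="articles.json"'
            return response

        # The csv and zip branches would build the file in memory (csv module / zipfile)
        # and return it the same way with a matching Content-Type and filename.
        return HttpResponse("unsupported format", status=400)

The X-CSRFToken header added by the fetch() calls is what lets these JSON POSTs pass Django's CSRF middleware without a regular form submission.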
|
||||
|
||||
|
||||
0
core/templatetags/__init__.py
Normal file
0
core/templatetags/__init__.py
Normal file
46
core/templatetags/core_extras.py
Normal file
46
core/templatetags/core_extras.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from django import template
from django.core.cache import cache
from core.models import CrawlTask

register = template.Library()


@register.simple_tag
def get_recent_tasks(limit=5):
    """获取最近的任务"""
    cache_key = f'recent_tasks_{limit}'
    recent_tasks = cache.get(cache_key)

    if recent_tasks is None:
        recent_tasks = CrawlTask.objects.all()[:limit]
        cache.set(cache_key, recent_tasks, 60)  # 缓存1分钟

    return recent_tasks


@register.filter
def task_status_color(status):
    """根据任务状态返回颜色"""
    color_map = {
        'pending': 'gray',
        'running': 'blue',
        'completed': 'green',
        'failed': 'red',
        'cancelled': 'orange',
    }
    return color_map.get(status, 'gray')


@register.filter
def task_progress_bar(progress):
    """生成进度条HTML"""
    if progress is None:
        progress = 0

    return f'''
    <div style="width: 100px; background-color: #f0f0f0; border-radius: 3px; overflow: hidden;">
        <div style="width: {progress}%; background-color: #4CAF50; height: 16px; text-align: center; line-height: 16px; color: white; font-size: 12px;">
            {progress}%
        </div>
    </div>
    '''
|
||||
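One note on task_progress_bar above: it returns raw HTML, and under Django's default auto-escaping a template that applied this filter would display the markup as escaped text rather than a progress bar. A common way to handle this is to mark the filter's output as safe; a minimal sketch, reusing the register defined in this file (the simplified inline styles here are illustrative, not the project's exact markup):

    from django.utils.safestring import mark_safe


    @register.filter
    def task_progress_bar(progress):
        """生成进度条HTML (wrapped in mark_safe so templates render it unescaped)."""
        progress = progress or 0
        return mark_safe(
            f'<div style="width: 100px; background-color: #f0f0f0;">'
            f'<div style="width: {progress}%; background-color: #4CAF50;">{progress}%</div>'
            f'</div>'
        )

The admin index template in this diff sidesteps the question by inlining its own progress-bar HTML instead of calling the filter, so this only matters if the filter is used directly in a template.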
313
core/tests.py
313
core/tests.py
@@ -1,3 +1,312 @@
|
||||
from django.test import TestCase
|
||||
import os
|
||||
import tempfile
|
||||
import shutil
|
||||
from django.test import TestCase, override_settings
|
||||
from django.core.management import call_command
|
||||
from django.core.management.base import CommandError
|
||||
from django.utils import timezone
|
||||
from django.core.files.uploadedfile import SimpleUploadedFile
|
||||
from unittest.mock import patch, MagicMock
|
||||
from .models import Website, Article
|
||||
from .utils import process_article, download_media, is_valid_url, full_site_crawler
|
||||
from .tasks import crawl_website, crawl_all_websites, health_check
|
||||
|
||||
# Create your tests here.
|
||||
|
||||
class WebsiteModelTest(TestCase):
|
||||
"""网站模型测试"""
|
||||
|
||||
def setUp(self):
|
||||
self.website = Website.objects.create(
|
||||
name='测试网站',
|
||||
base_url='https://test.com',
|
||||
description='测试描述'
|
||||
)
|
||||
|
||||
def test_website_creation(self):
|
||||
"""测试网站创建"""
|
||||
self.assertEqual(self.website.name, '测试网站')
|
||||
self.assertEqual(self.website.base_url, 'https://test.com')
|
||||
self.assertTrue(self.website.enabled)
|
||||
|
||||
def test_website_str(self):
|
||||
"""测试网站字符串表示"""
|
||||
self.assertEqual(str(self.website), '测试网站')
|
||||
|
||||
|
||||
class ArticleModelTest(TestCase):
|
||||
"""文章模型测试"""
|
||||
|
||||
def setUp(self):
|
||||
self.website = Website.objects.create(
|
||||
name='测试网站',
|
||||
base_url='https://test.com'
|
||||
)
|
||||
self.article = Article.objects.create(
|
||||
website=self.website,
|
||||
title='测试文章',
|
||||
url='https://test.com/article/1',
|
||||
content='<p>测试内容</p>',
|
||||
media_files=['image1.jpg', 'image2.jpg']
|
||||
)
|
||||
|
||||
def test_article_creation(self):
|
||||
"""测试文章创建"""
|
||||
self.assertEqual(self.article.title, '测试文章')
|
||||
self.assertEqual(self.article.url, 'https://test.com/article/1')
|
||||
self.assertEqual(len(self.article.media_files), 2)
|
||||
|
||||
def test_article_str(self):
|
||||
"""测试文章字符串表示"""
|
||||
self.assertEqual(str(self.article), '测试文章')
|
||||
|
||||
|
||||
class UtilsTest(TestCase):
|
||||
"""工具函数测试"""
|
||||
|
||||
def setUp(self):
|
||||
        self.website = Website.objects.create(
            name='测试网站',
            base_url='https://test.com'
        )
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        shutil.rmtree(self.temp_dir)

    def test_is_valid_url(self):
        """Test URL validation"""
        from .utils import is_valid_url

        # Valid URLs
        self.assertTrue(is_valid_url('https://test.com/article', 'test.com'))
        self.assertTrue(is_valid_url('http://test.com/article', 'test.com'))

        # Invalid URLs
        self.assertFalse(is_valid_url('https://other.com/article', 'test.com'))
        self.assertFalse(is_valid_url('ftp://test.com/article', 'test.com'))
        self.assertFalse(is_valid_url('invalid-url', 'test.com'))

    @patch('core.utils.requests.get')
    def test_download_media(self, mock_get):
        """Test media download"""
        # Mock the response
        mock_response = MagicMock()
        mock_response.content = b'fake image content'
        mock_response.headers = {'content-type': 'image/jpeg'}
        mock_get.return_value = mock_response

        # Test the download
        result = download_media('https://test.com/image.jpg', self.temp_dir)
        self.assertIsNotNone(result)
        self.assertTrue(os.path.exists(result))

    @patch('core.utils.requests.get')
    @patch('core.utils.download_media')
    def test_process_article_success(self, mock_download_media, mock_get):
        """Test successful article processing"""
        # Mock the HTML response
        html_content = '''
        <html>
        <head><title>测试文章</title></head>
        <body>
            <h1>测试文章标题</h1>
            <div class="content">
                <p>测试文章内容</p>
                <img src="https://test.com/image.jpg">
            </div>
        </body>
        </html>
        '''

        mock_response = MagicMock()
        mock_response.text = html_content
        mock_response.encoding = 'utf-8'
        mock_response.raise_for_status.return_value = None
        mock_get.return_value = mock_response

        # Mock the media download
        mock_download_media.return_value = '/tmp/test_image.jpg'

        # Process the article
        process_article('https://test.com/article/1', self.website)

        # Verify the article was saved
        article = Article.objects.filter(url='https://test.com/article/1').first()
        self.assertIsNotNone(article)
        self.assertEqual(article.title, '测试文章标题')


class ManagementCommandsTest(TestCase):
    """Management command tests"""

    def setUp(self):
        self.website = Website.objects.create(
            name='测试网站',
            base_url='https://test.com'
        )

    @patch('core.management.commands.crawl_all_media.call_command')
    def test_crawl_all_media_command(self, mock_call_command):
        """Test the batch crawl command"""
        # Mock command execution
        mock_call_command.return_value = None

        # Run the command
        call_command('crawl_all_media', media='rmrb,xinhua')

        # Verify the command was called
        mock_call_command.assert_called()


class CeleryTasksTest(TestCase):
    """Celery task tests"""

    def setUp(self):
        self.website = Website.objects.create(
            name='测试网站',
            base_url='https://test.com'
        )

    @patch('core.tasks.full_site_crawler')
    def test_crawl_website_task(self, mock_crawler):
        """Test the single-website crawl task"""
        # Mock the crawler function
        mock_crawler.return_value = None

        # Run the task
        result = crawl_website(self.website.id)

        # Verify the result
        self.assertEqual(result['website_id'], self.website.id)
        self.assertEqual(result['website_name'], '测试网站')
        self.assertEqual(result['status'], 'success')

    def test_crawl_website_task_invalid_id(self):
        """Test the task with an invalid website ID"""
        # Run the task
        with self.assertRaises(Exception):
            crawl_website(99999)

    @patch('core.tasks.crawl_website.delay')
    def test_crawl_all_websites_task(self, mock_delay):
        """Test the batch crawl task"""
        # Mock the subtask
        mock_result = MagicMock()
        mock_result.id = 'task-123'
        mock_delay.return_value = mock_result

        # Run the task
        result = crawl_all_websites()

        # Verify the result
        self.assertEqual(result['total_websites'], 1)
        self.assertEqual(result['status'], 'started')

    def test_health_check_task(self):
        """Test the health check task"""
        # Run the task
        result = health_check()

        # Verify the result
        self.assertEqual(result['database'], 'ok')
        self.assertEqual(result['website_count'], 1)
        self.assertEqual(result['article_count'], 0)


class IntegrationTest(TestCase):
    """Integration tests"""

    def setUp(self):
        self.website = Website.objects.create(
            name='集成测试网站',
            base_url='https://integration-test.com'
        )

    def test_full_workflow(self):
        """Test the full workflow"""
        # 1. Create the website
        self.assertEqual(Website.objects.count(), 1)

        # 2. Create an article
        article = Article.objects.create(
            website=self.website,
            title='集成测试文章',
            url='https://integration-test.com/article/1',
            content='<p>集成测试内容</p>'
        )

        # 3. Verify the relationship
        self.assertEqual(article.website, self.website)
        self.assertEqual(self.website.article_set.count(), 1)

        # 4. Verify data integrity
        self.assertIsNotNone(article.created_at)
        self.assertIsInstance(article.media_files, list)


@override_settings(MEDIA_ROOT=tempfile.mkdtemp())
class MediaHandlingTest(TestCase):
    """Media file handling tests"""

    def setUp(self):
        self.website = Website.objects.create(
            name='媒体测试网站',
            base_url='https://media-test.com'
        )

    def test_media_files_field(self):
        """Test the media_files field"""
        article = Article.objects.create(
            website=self.website,
            title='媒体测试文章',
            url='https://media-test.com/article/1',
            content='<p>测试内容</p>',
            media_files=['image1.jpg', 'video1.mp4']
        )

        # Verify the media file list
        self.assertEqual(len(article.media_files), 2)
        self.assertIn('image1.jpg', article.media_files)
        self.assertIn('video1.mp4', article.media_files)


class ErrorHandlingTest(TestCase):
    """Error handling tests"""

    def test_duplicate_url_handling(self):
        """Test duplicate URL handling"""
        website = Website.objects.create(
            name='错误测试网站',
            base_url='https://error-test.com'
        )

        # Create the first article
        article1 = Article.objects.create(
            website=website,
            title='第一篇文章',
            url='https://error-test.com/article/1',
            content='<p>内容1</p>'
        )

        # Attempt to create an article with the same URL
        with self.assertRaises(Exception):
            Article.objects.create(
                website=website,
                title='第二篇文章',
                url='https://error-test.com/article/1',  # duplicate URL
                content='<p>内容2</p>'
            )

    def test_invalid_website_data(self):
        """Test invalid website data"""
        # Duplicate name (unique constraint)
        Website.objects.create(
            name='测试网站1',
            base_url='https://test1.com'
        )

        with self.assertRaises(Exception):
            Website.objects.create(
                name='测试网站1',  # duplicate name
                base_url='https://test2.com'
            )
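The CeleryTasksTest cases above assume that core.tasks exposes crawl_website, crawl_all_websites and health_check, and that crawl_website returns a dict with website_id, website_name and status keys. Those task definitions are not part of this diff; a minimal sketch consistent with the assertions (the names and fields here are taken from the tests, not from core/tasks.py itself) might look like:

from celery import shared_task
from .models import Website

@shared_task
def crawl_website(website_id):
    # An unknown id raises Website.DoesNotExist, which the invalid-id test expects
    website = Website.objects.get(id=website_id)
    # ... run the actual crawl here ...
    return {
        'website_id': website.id,
        'website_name': website.name,
        'status': 'success',
    }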
@@ -2,9 +2,11 @@ from django.urls import path
from . import views

urlpatterns = [
    # Home page: article list
    path('', views.article_list, name='article_list'),
    # Article detail
    path('article/<int:article_id>/', views.article_detail, name='article_detail'),
    # More paths can be added later
    path('run-crawler/', views.run_crawler, name='run_crawler'),
    path('crawler-status/', views.crawler_status, name='crawler_status'),
    path('pause-crawler/', views.pause_crawler, name='pause_crawler'),
    path('export-articles/', views.export_articles, name='export_articles'),
    path('export-articles-by-type/', views.export_articles_by_type, name='export_articles_by_type'),
]
3145 core/utils.py (File diff suppressed because it is too large)
610 core/views.py
@@ -1,15 +1,30 @@
import uuid
from django.shortcuts import render
from django.core.paginator import Paginator
from django.http import JsonResponse
from django.views.decorators.http import require_http_methods
from django.core.management import call_command
from .models import Article, Website
import threading
from django.http import HttpResponse
import json
import csv
from django.views.decorators.csrf import csrf_exempt
from django.utils import timezone

# Global dict used to track crawler task state
crawler_tasks = {}


def article_list(request):
    # Get all enabled websites
    websites = Website.objects.filter(enabled=True)

    # Get the selected website filter
    selected_website = None
    # Changed: always fetch all articles unless a specific filter is applied
    articles = Article.objects.all()

    website_id = request.GET.get('website')
    if website_id:
        try:
@@ -17,28 +32,605 @@ def article_list(request):
            articles = articles.filter(website=selected_website)
        except Website.DoesNotExist:
            pass

    # New: handle keyword search

    # Handle keyword search
    search_query = request.GET.get('q')
    if search_query:
        articles = articles.filter(title__icontains=search_query)

    # New: handle media type filtering
    media_type = request.GET.get('media_type', 'all')
    if media_type == 'text_only':
        # Text-only articles (no media files)
        articles = articles.filter(media_files__isnull=True) | articles.filter(media_files=[])
    elif media_type == 'with_images':
        # Articles containing images
        articles = articles.filter(media_files__icontains='.jpg') | \
                   articles.filter(media_files__icontains='.jpeg') | \
                   articles.filter(media_files__icontains='.png') | \
                   articles.filter(media_files__icontains='.gif')
    elif media_type == 'with_videos':
        # Articles containing videos
        articles = articles.filter(media_files__icontains='.mp4') | \
                   articles.filter(media_files__icontains='.avi') | \
                   articles.filter(media_files__icontains='.mov') | \
                   articles.filter(media_files__icontains='.wmv') | \
                   articles.filter(media_files__icontains='.flv') | \
                   articles.filter(media_files__icontains='.webm')

    # Order by creation time, newest first
    articles = articles.order_by('-created_at')

    # Pagination
    paginator = Paginator(articles, 10)  # 10 articles per page
    paginator = Paginator(articles, 40)  # 40 articles per page
    page_number = request.GET.get('page')
    page_obj = paginator.get_page(page_number)

    return render(request, 'core/article_list.html', {
        'page_obj': page_obj,
        'websites': websites,
        'selected_website': selected_website,
        # New: pass the search keyword to the template
        'search_query': search_query
    })


def article_detail(request, article_id):
    article = Article.objects.get(id=article_id)
    return render(request, 'core/article_detail.html', {'article': article})

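The media_type branches above union several querysets, one per file extension. An equivalent formulation with Q objects keeps a single filter call; this is only an illustrative alternative under the same media_files JSON-list assumption, not what views.py actually does:

from django.db.models import Q

IMAGE_EXTS = ['.jpg', '.jpeg', '.png', '.gif']

def with_images(queryset):
    # Build one OR-ed condition instead of uniting several querysets
    condition = Q()
    for ext in IMAGE_EXTS:
        condition |= Q(media_files__icontains=ext)
    return queryset.filter(condition)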
# Task ID generation and status tracking
@require_http_methods(["POST"])
def run_crawler(request):
    """
    Trigger a crawler task from the front end
    """
    try:
        # Get the name of the crawler to run
        crawler_name = request.POST.get('crawler_name', '')
        if not crawler_name:
            return JsonResponse({'status': 'error', 'message': '爬虫名称不能为空'})

        # Generate a task ID
        task_id = str(uuid.uuid4())

        # Record the article count before the task starts
        initial_count = Article.objects.count()

        # Run the crawler task in a background thread
        def run_spider():
            try:
                # Mark the task as running
                crawler_tasks[task_id] = {
                    'status': 'running',
                    'message': '爬虫正在运行...',
                    'start_time': timezone.now(),
                    'initial_count': initial_count
                }

                # Call the command matching the crawler name
                if crawler_name in ['crawl_xinhua', 'crawl_dongfangyancao']:
                    call_command(crawler_name)
                else:
                    # For generic crawlers, use crawl_articles
                    call_command('crawl_articles', crawler_name)

                # Count the newly added articles
                final_count = Article.objects.count()
                added_count = final_count - initial_count

                # Mark the task as completed
                crawler_tasks[task_id] = {
                    'status': 'completed',
                    'message': f'爬虫已完成,新增 {added_count} 篇文章',
                    'added_count': added_count,
                    'end_time': timezone.now()
                }
            except Exception as e:
                # Changed: improved error handling with friendlier messages
                error_msg = str(e)
                if "UNIQUE constraint failed" in error_msg and "core_article.url" in error_msg:
                    error_msg = "检测到重复文章URL,已跳过重复项"
                else:
                    print(f"爬虫执行出错: {e}")

                # Count the articles actually added (even on error)
                final_count = Article.objects.count()
                added_count = final_count - initial_count

                # Mark the task as completed (even with partial errors)
                crawler_tasks[task_id] = {
                    'status': 'completed',
                    'message': f'爬虫已完成,新增 {added_count} 篇文章。{error_msg}',
                    'added_count': added_count,
                    'end_time': timezone.now(),
                    'error': error_msg
                }

        # Start the background thread that runs the crawler
        thread = threading.Thread(target=run_spider)
        thread.daemon = True
        thread.start()

        return JsonResponse({'status': 'success', 'message': f'爬虫 {crawler_name} 已启动', 'task_id': task_id})
    except Exception as e:
        return JsonResponse({'status': 'error', 'message': str(e)})


# View for checking crawler status
@require_http_methods(["POST"])
def crawler_status(request):
    """
    Check the status of a crawler task
    """
    try:
        task_id = request.POST.get('task_id', '')
        if not task_id:
            return JsonResponse({'status': 'error', 'message': '任务ID不能为空'})

        # Get the task status
        task_info = crawler_tasks.get(task_id)
        if not task_info:
            return JsonResponse({'status': 'error', 'message': '未找到任务'})

        return JsonResponse(task_info)
    except Exception as e:
        return JsonResponse({'status': 'error', 'message': str(e)})


# New: view for pausing the crawler
@require_http_methods(["POST"])
def pause_crawler(request):
    """
    Pause a crawler task
    """
    try:
        task_id = request.POST.get('task_id', '')
        if not task_id:
            return JsonResponse({'status': 'error', 'message': '任务ID不能为空'})

        # Get the task status
        task_info = crawler_tasks.get(task_id)
        if not task_info:
            return JsonResponse({'status': 'error', 'message': '未找到任务'})

        # A real implementation should pause the crawler here;
        # for now we only update the task status to simulate pausing
        task_info['status'] = 'paused'
        task_info['message'] = '爬虫已暂停'

        return JsonResponse({
            'status': 'success',
            'message': '爬虫已暂停',
            'progress': 0  # should return the actual progress
        })
    except Exception as e:
        return JsonResponse({'status': 'error', 'message': str(e)})

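Taken together with the URL patterns above, the intended client flow is: POST crawler_name to /run-crawler/, keep the returned task_id, then poll /crawler-status/ with it. A rough client-side sketch using the requests library (the host and crawler name are placeholders, and since run_crawler is not csrf_exempt a real client must also supply a CSRF token):

import time
import requests

BASE = 'http://localhost:8000'  # placeholder host

resp = requests.post(f'{BASE}/run-crawler/', data={'crawler_name': 'crawl_xinhua'})
task_id = resp.json()['task_id']

# Poll until the background thread reports completion
while True:
    status = requests.post(f'{BASE}/crawler-status/', data={'task_id': task_id}).json()
    if status.get('status') in ('completed', 'paused'):
        print(status.get('message'))
        break
    time.sleep(5)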
# 新增:文章导出视图
|
||||
@csrf_exempt
|
||||
@require_http_methods(["POST"])
|
||||
def export_articles(request):
|
||||
try:
|
||||
# 解析请求数据
|
||||
data = json.loads(request.body)
|
||||
article_ids = data.get('article_ids', [])
|
||||
format_type = data.get('format', 'json')
|
||||
|
||||
# 获取选中的文章
|
||||
articles = Article.objects.filter(id__in=article_ids)
|
||||
|
||||
if not articles.exists():
|
||||
return HttpResponse('没有选中文章', status=400)
|
||||
|
||||
# 根据格式类型导出
|
||||
if format_type == 'json':
|
||||
# 准备JSON数据
|
||||
articles_data = []
|
||||
for article in articles:
|
||||
articles_data.append({
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'website': article.website.name,
|
||||
'url': article.url,
|
||||
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
|
||||
'content': article.content,
|
||||
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'media_files': article.media_files
|
||||
})
|
||||
|
||||
# 创建JSON响应
|
||||
response = HttpResponse(
|
||||
json.dumps(articles_data, ensure_ascii=False, indent=2),
|
||||
content_type='application/json'
|
||||
)
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.json"'
|
||||
return response
|
||||
|
||||
elif format_type == 'csv':
|
||||
# 创建CSV响应
|
||||
response = HttpResponse(content_type='text/csv')
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.csv"'
|
||||
|
||||
# 创建CSV写入器
|
||||
writer = csv.writer(response)
|
||||
writer.writerow(['ID', '标题', '网站', 'URL', '发布时间', '内容', '创建时间', '媒体文件'])
|
||||
|
||||
# 写入文章数据
|
||||
for article in articles:
|
||||
writer.writerow([
|
||||
article.id,
|
||||
article.title,
|
||||
article.website.name,
|
||||
article.url,
|
||||
article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else '',
|
||||
article.content,
|
||||
article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
';'.join(article.media_files) if article.media_files else ''
|
||||
])
|
||||
|
||||
return response
|
||||
|
||||
# 新增:支持ZIP格式导出
|
||||
elif format_type == 'zip':
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from django.conf import settings
|
||||
import os
|
||||
|
||||
# 创建内存中的ZIP文件
|
||||
zip_buffer = BytesIO()
|
||||
|
||||
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
|
||||
# 为每篇文章创建Word文档并添加到ZIP文件中
|
||||
for article in articles:
|
||||
# 为每篇文章创建单独的文件夹
|
||||
article_folder = f"article_{article.id}_{article.title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')}"
|
||||
|
||||
# 创建文章数据
|
||||
article_data = {
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'website': article.website.name,
|
||||
'url': article.url,
|
||||
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
|
||||
'content': article.content,
|
||||
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'media_files': article.media_files
|
||||
}
|
||||
|
||||
# 将文章数据保存为Word文件并添加到ZIP
|
||||
try:
|
||||
from docx import Document
|
||||
from docx.shared import Inches
|
||||
from io import BytesIO
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading(article.title, 0)
|
||||
|
||||
# 添加文章元数据
|
||||
doc.add_paragraph(f"网站: {article.website.name}")
|
||||
doc.add_paragraph(f"URL: {article.url}")
|
||||
doc.add_paragraph(
|
||||
f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
|
||||
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
# 添加文章内容
|
||||
doc.add_heading('内容', level=1)
|
||||
|
||||
# 处理HTML内容
|
||||
soup = BeautifulSoup(article.content, 'html.parser')
|
||||
|
||||
# 处理内容中的图片
|
||||
for img in soup.find_all('img'):
|
||||
src = img.get('src', '')
|
||||
if src:
|
||||
try:
|
||||
# 构建完整的图片路径
|
||||
if src.startswith('http'):
|
||||
# 网络图片
|
||||
response = requests.get(src, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
# 本地图片
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
|
||||
if os.path.exists(full_path):
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
except Exception as e:
|
||||
# 如果添加图片失败,添加图片URL作为文本
|
||||
doc.add_paragraph(f"[图片: {src}]")
|
||||
|
||||
# 移除原始img标签
|
||||
img.decompose()
|
||||
|
||||
content_text = soup.get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 添加媒体文件信息
|
||||
if article.media_files:
|
||||
doc.add_heading('媒体文件', level=1)
|
||||
for media_file in article.media_files:
|
||||
try:
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 检查文件扩展名以确定处理方式
|
||||
file_extension = os.path.splitext(media_file)[1].lower()
|
||||
|
||||
# 图片文件处理
|
||||
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
# 视频文件处理
|
||||
elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
|
||||
doc.add_paragraph(f"[视频文件: {media_file}]")
|
||||
# 其他文件类型
|
||||
else:
|
||||
doc.add_paragraph(f"[文件: {media_file}]")
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
response = requests.get(media_file, timeout=10)
|
||||
file_extension = os.path.splitext(media_file)[1].lower()
|
||||
|
||||
# 图片文件处理
|
||||
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
doc.add_paragraph(f"[文件: {media_file}]")
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
except Exception as e:
|
||||
doc.add_paragraph(media_file)
|
||||
|
||||
# 保存Word文档到内存
|
||||
doc_buffer = BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
|
||||
# 将Word文档添加到ZIP包
|
||||
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'),
|
||||
doc_buffer.read())
|
||||
|
||||
except ImportError:
|
||||
# 如果没有安装python-docx库,回退到JSON格式
|
||||
json_data = json.dumps(article_data, ensure_ascii=False, indent=2)
|
||||
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.json'),
|
||||
json_data)
|
||||
|
||||
# 添加媒体文件到ZIP包
|
||||
if article.media_files:
|
||||
for media_file in article.media_files:
|
||||
try:
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
import requests
|
||||
response = requests.get(media_file, timeout=10)
|
||||
zip_file.writestr(
|
||||
os.path.join(article_folder, 'media', os.path.basename(media_file)),
|
||||
response.content)
|
||||
except Exception as e:
|
||||
# 如果添加媒体文件失败,继续处理其他文件
|
||||
pass
|
||||
|
||||
# 创建HttpResponse
|
||||
zip_buffer.seek(0)
|
||||
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
|
||||
response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
|
||||
return response
|
||||
|
||||
else:
|
||||
return HttpResponse('不支持的格式', status=400)
|
||||
|
||||
except Exception as e:
|
||||
return HttpResponse(f'导出失败: {str(e)}', status=500)
|
||||
|
||||
|
||||
# 新增:按媒体类型导出文章视图
|
||||
@csrf_exempt
|
||||
@require_http_methods(["POST"])
|
||||
def export_articles_by_type(request):
|
||||
try:
|
||||
# 解析请求数据
|
||||
data = json.loads(request.body)
|
||||
media_type = data.get('media_type', 'all')
|
||||
format_type = data.get('format', 'zip')
|
||||
|
||||
# 根据媒体类型筛选文章
|
||||
if media_type == 'text_only':
|
||||
# 纯文本文章(没有媒体文件或媒体文件为空)
|
||||
articles = Article.objects.filter(media_files__isnull=True) | Article.objects.filter(media_files=[])
|
||||
elif media_type == 'with_images':
|
||||
# 包含图片的文章
|
||||
articles = Article.objects.filter(media_files__icontains='.jpg') | \
|
||||
Article.objects.filter(media_files__icontains='.jpeg') | \
|
||||
Article.objects.filter(media_files__icontains='.png') | \
|
||||
Article.objects.filter(media_files__icontains='.gif')
|
||||
elif media_type == 'with_videos':
|
||||
# 包含视频的文章
|
||||
articles = Article.objects.filter(media_files__icontains='.mp4') | \
|
||||
Article.objects.filter(media_files__icontains='.avi') | \
|
||||
Article.objects.filter(media_files__icontains='.mov') | \
|
||||
Article.objects.filter(media_files__icontains='.wmv') | \
|
||||
Article.objects.filter(media_files__icontains='.flv') | \
|
||||
Article.objects.filter(media_files__icontains='.webm')
|
||||
else:
|
||||
# 所有文章
|
||||
articles = Article.objects.all()
|
||||
|
||||
# 去重处理
|
||||
articles = articles.distinct()
|
||||
|
||||
if not articles.exists():
|
||||
return HttpResponse('没有符合条件的文章', status=400)
|
||||
|
||||
# 导出为ZIP格式
|
||||
if format_type == 'zip':
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from django.conf import settings
|
||||
import os
|
||||
|
||||
# 创建内存中的ZIP文件
|
||||
zip_buffer = BytesIO()
|
||||
|
||||
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
|
||||
# 为每篇文章创建Word文档并添加到ZIP文件中
|
||||
for article in articles:
|
||||
# 为每篇文章创建单独的文件夹
|
||||
article_folder = f"article_{article.id}_{article.title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')}"
|
||||
|
||||
# 创建文章数据
|
||||
article_data = {
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'website': article.website.name,
|
||||
'url': article.url,
|
||||
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
|
||||
'content': article.content,
|
||||
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'media_files': article.media_files
|
||||
}
|
||||
|
||||
# 将文章数据保存为Word文件并添加到ZIP
|
||||
try:
|
||||
from docx import Document
|
||||
from docx.shared import Inches
|
||||
from io import BytesIO
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading(article.title, 0)
|
||||
|
||||
# 添加文章元数据
|
||||
doc.add_paragraph(f"网站: {article.website.name}")
|
||||
doc.add_paragraph(f"URL: {article.url}")
|
||||
doc.add_paragraph(
|
||||
f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
|
||||
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
# 添加文章内容
|
||||
doc.add_heading('内容', level=1)
|
||||
|
||||
# 处理HTML内容
|
||||
soup = BeautifulSoup(article.content, 'html.parser')
|
||||
|
||||
# 处理内容中的图片
|
||||
for img in soup.find_all('img'):
|
||||
src = img.get('src', '')
|
||||
if src:
|
||||
try:
|
||||
# 构建完整的图片路径
|
||||
if src.startswith('http'):
|
||||
# 网络图片
|
||||
response = requests.get(src, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
# 本地图片
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
|
||||
if os.path.exists(full_path):
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
except Exception as e:
|
||||
# 如果添加图片失败,添加图片URL作为文本
|
||||
doc.add_paragraph(f"[图片: {src}]")
|
||||
|
||||
# 移除原始img标签
|
||||
img.decompose()
|
||||
|
||||
content_text = soup.get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 添加媒体文件信息
|
||||
if article.media_files:
|
||||
doc.add_heading('媒体文件', level=1)
|
||||
for media_file in article.media_files:
|
||||
try:
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 检查文件扩展名以确定处理方式
|
||||
file_extension = os.path.splitext(media_file)[1].lower()
|
||||
|
||||
# 图片文件处理
|
||||
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
# 视频文件处理
|
||||
elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
|
||||
doc.add_paragraph(f"[视频文件: {media_file}]")
|
||||
# 其他文件类型
|
||||
else:
|
||||
doc.add_paragraph(f"[文件: {media_file}]")
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
response = requests.get(media_file, timeout=10)
|
||||
file_extension = os.path.splitext(media_file)[1].lower()
|
||||
|
||||
# 图片文件处理
|
||||
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
doc.add_paragraph(f"[文件: {media_file}]")
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
except Exception as e:
|
||||
doc.add_paragraph(media_file)
|
||||
|
||||
# 保存Word文档到内存
|
||||
doc_buffer = BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
|
||||
# 将Word文档添加到ZIP包
|
||||
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'),
|
||||
doc_buffer.read())
|
||||
|
||||
except ImportError:
|
||||
# 如果没有安装python-docx库,回退到JSON格式
|
||||
json_data = json.dumps(article_data, ensure_ascii=False, indent=2)
|
||||
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.json'),
|
||||
json_data)
|
||||
|
||||
# 添加媒体文件到ZIP包
|
||||
if article.media_files:
|
||||
for media_file in article.media_files:
|
||||
try:
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
import requests
|
||||
response = requests.get(media_file, timeout=10)
|
||||
zip_file.writestr(
|
||||
os.path.join(article_folder, 'media', os.path.basename(media_file)),
|
||||
response.content)
|
||||
except Exception as e:
|
||||
# 如果添加媒体文件失败,继续处理其他文件
|
||||
pass
|
||||
|
||||
# 创建HttpResponse
|
||||
zip_buffer.seek(0)
|
||||
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
|
||||
response['Content-Disposition'] = f'attachment; filename=articles_{media_type}.zip'
|
||||
return response
|
||||
|
||||
else:
|
||||
return HttpResponse('不支持的格式', status=400)
|
||||
|
||||
except Exception as e:
|
||||
return HttpResponse(f'导出失败: {str(e)}', status=500)
|
||||
|
||||
710 crawler_engine.py (Normal file)
@@ -0,0 +1,710 @@
import requests
import time
import re
import logging
import os
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from django.conf import settings
from django.utils import timezone
from django.core.files.base import ContentFile
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from .models import Website, CrawlTask, CrawledContent, CrawlLog, SearchKeyword, MediaFile

# Disable SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Set up the logger
logger = logging.getLogger(__name__)


class WebsiteCrawler:
    """Website crawler engine"""

    def __init__(self, task_id):
        self.task = CrawlTask.objects.get(id=task_id)
        self.keywords = [kw.strip() for kw in self.task.keywords.split(',') if kw.strip()]

        # Create a session with a retry policy
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': settings.CRAWLER_SETTINGS['USER_AGENT']
        })

        # Configure the retry policy
        retry_strategy = Retry(
            total=settings.CRAWLER_SETTINGS.get('MAX_RETRIES', 3),
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)

        # Configure the timeout
        self.timeout = settings.CRAWLER_SETTINGS['TIMEOUT']

    def log(self, level, message, website=None):
        """Record a log entry"""
        CrawlLog.objects.create(
            task=self.task,
            website=website,
            level=level,
            message=message
        )
        # Also log to the Python logging system
        logger.log(getattr(logging, level.upper()), f"Task {self.task.id}: {message}")

    def update_task_status(self, status, **kwargs):
        """Update the task status"""
        self.task.status = status
        if status == 'running' and not self.task.started_at:
            self.task.started_at = timezone.now()
        elif status in ['completed', 'failed', 'cancelled']:
            self.task.completed_at = timezone.now()

        for key, value in kwargs.items():
            setattr(self.task, key, value)
        self.task.save()

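The session set up in __init__ is the standard requests/urllib3 retry pattern. The same idea works outside the class, for example when probing a single URL during debugging; in this sketch the CRAWLER_SETTINGS values are replaced by literals and the User-Agent string is a placeholder:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (debug probe)'})  # placeholder UA

retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Retries with exponential backoff are handled by the mounted adapter
response = session.get('https://example.com', timeout=30, verify=False)
print(response.status_code)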
    def extract_text_content(self, soup):
        """Extract text content while preserving paragraph structure"""
        # Remove script and style tags
        for script in soup(["script", "style"]):
            script.decompose()

        # Process paragraph tags to keep the paragraph structure
        paragraphs = []

        # Find all paragraph-related tags
        for element in soup.find_all(['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br']):
            if element.name in ['p', 'div']:
                text = element.get_text().strip()
                if text:
                    paragraphs.append(text)
            elif element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
                text = element.get_text().strip()
                if text:
                    paragraphs.append(f"\n{text}\n")  # add line breaks around headings
            elif element.name == 'br':
                paragraphs.append('\n')

        # If no paragraph tags were found, fall back to the old approach
        if not paragraphs:
            text = soup.get_text()
            # Clean the text but keep line breaks
            lines = []
            for line in text.splitlines():
                line = line.strip()
                if line:
                    lines.append(line)
            return '\n\n'.join(lines)

        # Join paragraphs with double line breaks
        content = '\n\n'.join(paragraphs)

        # Collapse extra blank lines
        import re
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)

        return content.strip()

    def find_article_links(self, soup, base_url):
        """Find article links"""
        links = []

        # Common selectors for article links
        selectors = [
            'a[href*="article"]',
            'a[href*="news"]',
            'a[href*="content"]',
            'a[href*="detail"]',
            'a[href*="view"]',
            'a[href*="show"]',
            '.news-list a',
            '.article-list a',
            '.content-list a',
            'h3 a',
            'h4 a',
            '.title a',
            '.list-item a'
        ]

        for selector in selectors:
            elements = soup.select(selector)
            for element in elements:
                href = element.get('href')
                if href:
                    full_url = urljoin(base_url, href)
                    title = element.get_text().strip()
                    if title and len(title) > 5:  # skip titles that are too short
                        links.append({
                            'url': full_url,
                            'title': title
                        })

        return links

    def check_keyword_match(self, text, title):
        """Check for keyword matches"""
        matched_keywords = []
        text_lower = text.lower()
        title_lower = title.lower()

        for keyword in self.keywords:
            keyword_lower = keyword.lower()
            if keyword_lower in text_lower or keyword_lower in title_lower:
                matched_keywords.append(keyword)

        return matched_keywords

    def extract_article_content(self, url, soup):
        """Extract the article content"""
        # Try several content selectors
        content_selectors = [
            '.article-content',
            '.content',
            '.article-body',
            '.news-content',
            '.main-content',
            '.post-content',
            'article',
            '.detail-content',
            '#content',
            '.text'
        ]

        content = ""
        for selector in content_selectors:
            element = soup.select_one(selector)
            if element:
                content = self.extract_text_content(element)
                if len(content) > 100:  # make sure the content is long enough
                    break

        # If no specific content area was found, use the whole page
        if not content or len(content) < 100:
            content = self.extract_text_content(soup)

        return content

def extract_publish_date(self, soup):
|
||||
"""提取发布时间"""
|
||||
date_selectors = [
|
||||
'.publish-time',
|
||||
'.pub-time',
|
||||
'.date',
|
||||
'.time',
|
||||
'.publish-date',
|
||||
'time[datetime]',
|
||||
'.article-time',
|
||||
'.news-time',
|
||||
'.post-time',
|
||||
'.create-time',
|
||||
'.update-time',
|
||||
'.time span',
|
||||
'.date span',
|
||||
'.info span', # 一些网站使用.info类包含发布信息
|
||||
'.meta span',
|
||||
'.meta-info',
|
||||
'.article-info span',
|
||||
'.news-info span',
|
||||
'.content-info span',
|
||||
'.a-shijian', # 上海纪检监察网站的发布时间类
|
||||
'.l-time' # 天津纪检监察网站的发布时间类
|
||||
]
|
||||
|
||||
for selector in date_selectors:
|
||||
elements = soup.select(selector)
|
||||
for element in elements:
|
||||
date_text = element.get_text().strip()
|
||||
if element.get('datetime'):
|
||||
date_text = element.get('datetime')
|
||||
|
||||
# 如果文本太短或为空,跳过
|
||||
if not date_text or len(date_text) < 4:
|
||||
continue
|
||||
|
||||
# 尝试解析日期
|
||||
try:
|
||||
from datetime import datetime
|
||||
import re
|
||||
|
||||
# 清理日期文本,移除常见的无关字符
|
||||
date_text = re.sub(r'发布(时间|日期)[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'时间[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'日期[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'发表于[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'更新[::]?', '', date_text).strip()
|
||||
date_text = re.sub(r'\s+', ' ', date_text).strip() # 替换多个空白字符为单个空格
|
||||
|
||||
# 如果有 datetime 属性且是标准格式,直接使用
|
||||
if element.get('datetime'):
|
||||
datetime_attr = element.get('datetime')
|
||||
# 尝试解析常见的日期时间格式
|
||||
for fmt in [
|
||||
'%Y-%m-%d %H:%M:%S',
|
||||
'%Y-%m-%dT%H:%M:%S',
|
||||
'%Y-%m-%dT%H:%M:%S%z',
|
||||
'%Y-%m-%d %H:%M',
|
||||
'%Y-%m-%d',
|
||||
'%Y/%m/%d %H:%M:%S',
|
||||
'%Y/%m/%d %H:%M',
|
||||
'%Y/%m/%d',
|
||||
'%Y年%m月%d日 %H:%M:%S',
|
||||
'%Y年%m月%d日 %H:%M',
|
||||
'%Y年%m月%d日',
|
||||
'%m/%d/%Y %H:%M:%S',
|
||||
'%m/%d/%Y %H:%M',
|
||||
'%m/%d/%Y',
|
||||
'%d/%m/%Y %H:%M:%S',
|
||||
'%d/%m/%Y %H:%M',
|
||||
'%d/%m/%Y',
|
||||
'%d.%m.%Y %H:%M:%S',
|
||||
'%d.%m.%Y %H:%M',
|
||||
'%d.%m.%Y'
|
||||
]:
|
||||
try:
|
||||
if '%z' in fmt and '+' not in datetime_attr and datetime_attr.endswith('Z'):
|
||||
datetime_attr = datetime_attr[:-1] + '+0000'
|
||||
parsed_date = datetime.strptime(datetime_attr, fmt)
|
||||
if not timezone.is_aware(parsed_date):
|
||||
parsed_date = timezone.make_aware(parsed_date)
|
||||
return parsed_date
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# 尝试解析从文本中提取的日期
|
||||
# 尝试解析各种常见的中文日期格式
|
||||
for fmt in [
|
||||
'%Y年%m月%d日 %H:%M:%S',
|
||||
'%Y年%m月%d日 %H:%M',
|
||||
'%Y年%m月%d日',
|
||||
'%Y-%m-%d %H:%M:%S',
|
||||
'%Y-%m-%d %H:%M',
|
||||
'%Y-%m-%d',
|
||||
'%Y/%m/%d %H:%M:%S',
|
||||
'%Y/%m/%d %H:%M',
|
||||
'%Y/%m/%d',
|
||||
'%m月%d日 %H:%M',
|
||||
'%m月%d日',
|
||||
'%m/%d/%Y %H:%M:%S',
|
||||
'%m/%d/%Y %H:%M',
|
||||
'%m/%d/%Y',
|
||||
'%d/%m/%Y %H:%M:%S',
|
||||
'%d/%m/%Y %H:%M',
|
||||
'%d/%m/%Y',
|
||||
'%d.%m.%Y %H:%M:%S',
|
||||
'%d.%m.%Y %H:%M',
|
||||
'%d.%m.%Y'
|
||||
]:
|
||||
try:
|
||||
parsed_date = datetime.strptime(date_text, fmt)
|
||||
# 如果没有年份,使用当前年份
|
||||
if '%Y' not in fmt:
|
||||
parsed_date = parsed_date.replace(year=datetime.now().year)
|
||||
if not timezone.is_aware(parsed_date):
|
||||
parsed_date = timezone.make_aware(parsed_date)
|
||||
return parsed_date
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# 如果以上格式都不匹配,尝试使用 dateutil 解析
|
||||
try:
|
||||
from dateutil import parser
|
||||
# 过滤掉明显不是日期的文本
|
||||
if len(date_text) > 5 and not date_text.isdigit():
|
||||
parsed_date = parser.parse(date_text)
|
||||
if not timezone.is_aware(parsed_date):
|
||||
parsed_date = timezone.make_aware(parsed_date)
|
||||
return parsed_date
|
||||
except:
|
||||
pass
|
||||
|
||||
except Exception as e:
|
||||
self.log('debug', f'解析日期失败: {date_text}, 错误: {str(e)}')
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
def extract_author(self, soup):
|
||||
"""提取作者信息"""
|
||||
author_selectors = [
|
||||
'.author',
|
||||
'.writer',
|
||||
'.publisher',
|
||||
'.byline',
|
||||
'.article-author',
|
||||
'.news-author'
|
||||
]
|
||||
|
||||
for selector in author_selectors:
|
||||
element = soup.select_one(selector)
|
||||
if element:
|
||||
return element.get_text().strip()
|
||||
|
||||
return ""
|
||||
|
||||
def download_media_file(self, media_url, crawled_content, media_type='image', alt_text=''):
|
||||
"""下载媒体文件"""
|
||||
try:
|
||||
# 检查URL是否有效
|
||||
if not media_url or not media_url.startswith(('http://', 'https://')):
|
||||
return None
|
||||
|
||||
# 请求媒体文件
|
||||
response = self.session.get(
|
||||
media_url,
|
||||
timeout=self.timeout,
|
||||
verify=False,
|
||||
stream=False # 改为False以确保获取完整内容
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# 获取文件信息
|
||||
content_type = response.headers.get('content-type', '')
|
||||
content_length = response.headers.get('content-length')
|
||||
file_size = int(content_length) if content_length else len(response.content)
|
||||
|
||||
# 确定文件扩展名
|
||||
file_extension = self.get_file_extension_from_url(media_url, content_type)
|
||||
|
||||
# 生成文件名
|
||||
filename = f"media_{crawled_content.id}_{len(crawled_content.media_files.all())}{file_extension}"
|
||||
|
||||
# 创建媒体文件对象
|
||||
media_file = MediaFile.objects.create(
|
||||
content=crawled_content,
|
||||
media_type=media_type,
|
||||
original_url=media_url,
|
||||
file_size=file_size,
|
||||
mime_type=content_type,
|
||||
alt_text=alt_text
|
||||
)
|
||||
|
||||
# 保存文件
|
||||
media_file.local_file.save(
|
||||
filename,
|
||||
ContentFile(response.content),
|
||||
save=True
|
||||
)
|
||||
|
||||
self.log('info', f'媒体文件已下载: {filename} ({media_type})', crawled_content.website)
|
||||
return media_file
|
||||
|
||||
except Exception as e:
|
||||
self.log('error', f'下载媒体文件失败 {media_url}: {str(e)}', crawled_content.website)
|
||||
return None
|
||||
|
||||
def get_file_extension_from_url(self, url, content_type):
|
||||
"""从URL或内容类型获取文件扩展名"""
|
||||
# 从URL获取扩展名
|
||||
parsed_url = urlparse(url)
|
||||
path = parsed_url.path
|
||||
if '.' in path:
|
||||
return os.path.splitext(path)[1]
|
||||
|
||||
# 从内容类型获取扩展名
|
||||
content_type_map = {
|
||||
'image/jpeg': '.jpg',
|
||||
'image/jpg': '.jpg',
|
||||
'image/png': '.png',
|
||||
'image/gif': '.gif',
|
||||
'image/webp': '.webp',
|
||||
'image/svg+xml': '.svg',
|
||||
'video/mp4': '.mp4',
|
||||
'video/avi': '.avi',
|
||||
'video/mov': '.mov',
|
||||
'video/wmv': '.wmv',
|
||||
'video/flv': '.flv',
|
||||
'video/webm': '.webm',
|
||||
'audio/mp3': '.mp3',
|
||||
'audio/wav': '.wav',
|
||||
'audio/ogg': '.ogg',
|
||||
'application/pdf': '.pdf',
|
||||
'application/msword': '.doc',
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
|
||||
}
|
||||
|
||||
return content_type_map.get(content_type.lower(), '.bin')
|
||||
|
||||
def extract_and_download_media(self, soup, crawled_content, base_url):
|
||||
"""提取并下载页面中的媒体文件"""
|
||||
media_files = []
|
||||
|
||||
# 提取图片
|
||||
images = soup.find_all('img')
|
||||
self.log('info', f'找到 {len(images)} 个图片标签', crawled_content.website)
|
||||
|
||||
for img in images:
|
||||
src = img.get('src')
|
||||
if src:
|
||||
# 处理相对URL
|
||||
if src.startswith('//'):
|
||||
src = 'https:' + src
|
||||
elif src.startswith('/'):
|
||||
src = urljoin(base_url, src)
|
||||
elif not src.startswith(('http://', 'https://')):
|
||||
src = urljoin(base_url, src)
|
||||
|
||||
alt_text = img.get('alt', '')
|
||||
self.log('info', f'尝试下载图片: {src}', crawled_content.website)
|
||||
media_file = self.download_media_file(src, crawled_content, 'image', alt_text)
|
||||
if media_file:
|
||||
media_files.append(media_file)
|
||||
self.log('info', f'成功下载图片: {media_file.local_file.name}', crawled_content.website)
|
||||
|
||||
# 提取视频
|
||||
videos = soup.find_all(['video', 'source'])
|
||||
for video in videos:
|
||||
src = video.get('src')
|
||||
if src:
|
||||
# 处理相对URL
|
||||
if src.startswith('//'):
|
||||
src = 'https:' + src
|
||||
elif src.startswith('/'):
|
||||
src = urljoin(base_url, src)
|
||||
elif not src.startswith(('http://', 'https://')):
|
||||
src = urljoin(base_url, src)
|
||||
|
||||
media_file = self.download_media_file(src, crawled_content, 'video')
|
||||
if media_file:
|
||||
media_files.append(media_file)
|
||||
|
||||
# 提取音频
|
||||
audios = soup.find_all('audio')
|
||||
for audio in audios:
|
||||
src = audio.get('src')
|
||||
if src:
|
||||
# 处理相对URL
|
||||
if src.startswith('//'):
|
||||
src = 'https:' + src
|
||||
elif src.startswith('/'):
|
||||
src = urljoin(base_url, src)
|
||||
elif not src.startswith(('http://', 'https://')):
|
||||
src = urljoin(base_url, src)
|
||||
|
||||
media_file = self.download_media_file(src, crawled_content, 'audio')
|
||||
if media_file:
|
||||
media_files.append(media_file)
|
||||
|
||||
return media_files
|
||||
|
||||
def mark_content_saved(self, crawled_content):
|
||||
"""标记内容已保存(内容已存储在数据库中)"""
|
||||
try:
|
||||
crawled_content.is_local_saved = True
|
||||
crawled_content.save()
|
||||
|
||||
media_count = crawled_content.media_files.count()
|
||||
self.log('info', f'文章内容已保存到数据库 (包含 {media_count} 个媒体文件)', crawled_content.website)
|
||||
return True
|
||||
except Exception as e:
|
||||
self.log('error', f'标记内容保存状态失败: {str(e)}', crawled_content.website)
|
||||
return False
|
||||
|
||||
def crawl_website(self, website):
|
||||
"""爬取单个网站"""
|
||||
self.log('info', f'开始爬取网站: {website.name}', website)
|
||||
|
||||
try:
|
||||
# 请求主页
|
||||
response = self.session.get(
|
||||
website.url,
|
||||
timeout=self.timeout,
|
||||
verify=False # 忽略SSL证书验证
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
# 检查内容编码
|
||||
if response.encoding != 'utf-8':
|
||||
# 尝试从响应头获取编码
|
||||
content_type = response.headers.get('content-type', '')
|
||||
if 'charset=' in content_type:
|
||||
charset = content_type.split('charset=')[-1]
|
||||
response.encoding = charset
|
||||
else:
|
||||
response.encoding = 'utf-8'
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
|
||||
# 查找文章链接
|
||||
article_links = self.find_article_links(soup, website.url)
|
||||
self.log('info', f'找到 {len(article_links)} 个文章链接', website)
|
||||
|
||||
crawled_count = 0
|
||||
for link_info in article_links:
|
||||
try:
|
||||
# 请求文章页面
|
||||
article_response = self.session.get(
|
||||
link_info['url'],
|
||||
timeout=self.timeout,
|
||||
verify=False # 忽略SSL证书验证
|
||||
)
|
||||
article_response.raise_for_status()
|
||||
|
||||
# 检查内容编码
|
||||
if article_response.encoding != 'utf-8':
|
||||
# 尝试从响应头获取编码
|
||||
content_type = article_response.headers.get('content-type', '')
|
||||
if 'charset=' in content_type:
|
||||
charset = content_type.split('charset=')[-1]
|
||||
article_response.encoding = charset
|
||||
else:
|
||||
article_response.encoding = 'utf-8'
|
||||
|
||||
article_soup = BeautifulSoup(article_response.content, 'html.parser')
|
||||
|
||||
# 提取内容
|
||||
content = self.extract_article_content(link_info['url'], article_soup)
|
||||
title = link_info['title']
|
||||
|
||||
# 检查关键字匹配
|
||||
matched_keywords = self.check_keyword_match(content, title)
|
||||
|
||||
if matched_keywords:
|
||||
# 提取其他信息
|
||||
publish_date = self.extract_publish_date(article_soup)
|
||||
author = self.extract_author(article_soup)
|
||||
|
||||
# 检查是否已存在相同URL的文章
|
||||
existing_content = CrawledContent.objects.filter(
|
||||
url=link_info['url'],
|
||||
task=self.task
|
||||
).first()
|
||||
|
||||
if existing_content:
|
||||
# 如果已存在,更新现有记录而不是创建新记录
|
||||
existing_content.title = title
|
||||
existing_content.content = content
|
||||
existing_content.publish_date = publish_date
|
||||
existing_content.author = author
|
||||
existing_content.keywords_matched = ','.join(matched_keywords)
|
||||
existing_content.save()
|
||||
|
||||
# 更新媒体文件
|
||||
# 先删除旧的媒体文件
|
||||
existing_content.media_files.all().delete()
|
||||
# 然后重新下载媒体文件
|
||||
media_files = self.extract_and_download_media(article_soup, existing_content, link_info['url'])
|
||||
|
||||
self.log('info', f'更新已存在的文章: {title[:50]}...', website)
|
||||
else:
|
||||
# 保存新内容
|
||||
crawled_content = CrawledContent.objects.create(
|
||||
task=self.task,
|
||||
website=website,
|
||||
title=title,
|
||||
content=content,
|
||||
url=link_info['url'],
|
||||
publish_date=publish_date,
|
||||
author=author,
|
||||
keywords_matched=','.join(matched_keywords),
|
||||
is_local_saved=False # 初始设置为False,保存到本地后会更新为True
|
||||
)
|
||||
|
||||
# 提取并下载媒体文件
|
||||
media_files = self.extract_and_download_media(article_soup, crawled_content, link_info['url'])
|
||||
|
||||
# 标记内容已保存
|
||||
self.mark_content_saved(crawled_content)
|
||||
|
||||
self.log('info', f'保存新文章: {title[:50]}...', website)
|
||||
|
||||
crawled_count += 1
|
||||
|
||||
# 请求间隔
|
||||
time.sleep(settings.CRAWLER_SETTINGS['REQUEST_DELAY'])
|
||||
|
||||
except requests.exceptions.SSLError as e:
|
||||
self.log('error', f'SSL错误,跳过文章 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
self.log('error', f'连接错误,跳过文章 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
except requests.exceptions.Timeout as e:
|
||||
self.log('error', f'请求超时,跳过文章 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
except requests.exceptions.RequestException as e:
|
||||
self.log('error', f'网络请求错误,跳过文章 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
except UnicodeDecodeError as e:
|
||||
self.log('error', f'字符编码错误,跳过文章 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
except Exception as e:
|
||||
self.log('error', f'处理文章失败 {link_info["url"]}: {str(e)}', website)
|
||||
continue
|
||||
|
||||
self.log('info', f'网站爬取完成,共保存 {crawled_count} 篇文章', website)
|
||||
return crawled_count
|
||||
|
||||
except requests.exceptions.SSLError as e:
|
||||
self.log('error', f'爬取网站SSL错误: {str(e)}', website)
|
||||
return 0
|
||||
except requests.exceptions.ConnectionError as e:
|
||||
self.log('error', f'爬取网站连接错误: {str(e)}', website)
|
||||
return 0
|
||||
except requests.exceptions.Timeout as e:
|
||||
self.log('error', f'爬取网站超时: {str(e)}', website)
|
||||
return 0
|
||||
except requests.exceptions.RequestException as e:
|
||||
self.log('error', f'爬取网站网络错误: {str(e)}', website)
|
||||
return 0
|
||||
except Exception as e:
|
||||
self.log('error', f'爬取网站失败: {str(e)}', website)
|
||||
return 0
|
||||
|
||||
    def run(self):
        """Run the crawl task"""
        self.log('info', f'开始执行爬取任务: {self.task.name}')
        self.update_task_status('running')

        total_crawled = 0
        websites = self.task.websites.filter(is_active=True)
        self.task.total_pages = websites.count()
        self.task.save()

        for website in websites:
            try:
                crawled_count = self.crawl_website(website)
                total_crawled += crawled_count
                self.task.crawled_pages += 1
                self.task.save()

            except Exception as e:
                self.log('error', f'爬取网站 {website.name} 时发生错误: {str(e)}', website)
                continue

        # Update the task status
        if total_crawled > 0:
            self.update_task_status('completed')
            self.log('info', f'爬取任务完成,共爬取 {total_crawled} 篇文章')
        else:
            self.update_task_status('failed', error_message='没有找到匹配的内容')
            self.log('error', '爬取任务失败,没有找到匹配的内容')


def run_crawl_task(task_id):
    """Run a crawl task (Celery task)"""
    try:
        crawler = WebsiteCrawler(task_id)
        crawler.run()
        return f"任务 {task_id} 执行完成"
    except Exception as e:
        # Log the exception
        logger.error(f"执行任务 {task_id} 时发生异常: {str(e)}", exc_info=True)

        task = CrawlTask.objects.get(id=task_id)
        task.status = 'failed'
        task.error_message = str(e)
        task.completed_at = timezone.now()
        task.save()

        CrawlLog.objects.create(
            task=task,
            level='error',
            message=f'任务执行失败: {str(e)}'
        )
        return f"任务 {task_id} 执行失败: {str(e)}"
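run_crawl_task is a plain synchronous function even though its docstring calls it a Celery task, and this diff does not show how it is scheduled. One hedged possibility is to wrap it with shared_task so it can be queued through the broker configured in green_classroom/celery.py; the module path and task name below are assumptions, not part of this diff:

# e.g. in core/tasks.py (hypothetical wiring)
from celery import shared_task
from crawler_engine import run_crawl_task  # import path is an assumption

@shared_task(name='core.tasks.run_crawl_task')
def run_crawl_task_async(task_id):
    # Delegates to the synchronous engine; the task_routes config sends it to the 'crawler' queue
    return run_crawl_task(task_id)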
139 docker-compose.yml (Normal file)
@@ -0,0 +1,139 @@
version: '3.8'

services:
  # PostgreSQL database
  db:
    image: postgres:15
    environment:
      POSTGRES_DB: green_classroom
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
    volumes:
      - postgres_data:/var/lib/postgresql/data
    ports:
      - "5432:5432"
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Redis cache and message broker
  redis:
    image: redis:7-alpine
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 30s
      timeout: 10s
      retries: 3

  # Django web application
  web:
    build: .
    command: runserver
    environment:
      - DEBUG=False
      - DB_ENGINE=django.db.backends.postgresql
      - DB_NAME=green_classroom
      - DB_USER=postgres
      - DB_PASSWORD=postgres
      - DB_HOST=db
      - DB_PORT=5432
      - REDIS_URL=redis://redis:6379/0
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
      - SECRET_KEY=your-production-secret-key-here
      - ALLOWED_HOSTS=localhost,127.0.0.1
    volumes:
      - ./date/media:/app/date/media
      - ./logs:/app/logs
    ports:
      - "8000:8000"
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
    restart: unless-stopped

  # Celery worker
  celery:
    build: .
    command: celery
    environment:
      - DEBUG=False
      - DB_ENGINE=django.db.backends.postgresql
      - DB_NAME=green_classroom
      - DB_USER=postgres
      - DB_PASSWORD=postgres
      - DB_HOST=db
      - DB_PORT=5432
      - REDIS_URL=redis://redis:6379/0
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
      - SECRET_KEY=your-production-secret-key-here
    volumes:
      - ./date/media:/app/date/media
      - ./logs:/app/logs
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
    restart: unless-stopped

  # Celery Beat (scheduled tasks)
  celery-beat:
    build: .
    command: celery-beat
    environment:
      - DEBUG=False
      - DB_ENGINE=django.db.backends.postgresql
      - DB_NAME=green_classroom
      - DB_USER=postgres
      - DB_PASSWORD=postgres
      - DB_HOST=db
      - DB_PORT=5432
      - REDIS_URL=redis://redis:6379/0
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
      - SECRET_KEY=your-production-secret-key-here
    volumes:
      - ./date/media:/app/date/media
      - ./logs:/app/logs
    depends_on:
      db:
        condition: service_healthy
      redis:
        condition: service_healthy
    restart: unless-stopped

  # Flower (Celery monitoring)
  flower:
    build: .
    command: flower
    environment:
      - DEBUG=False
      - DB_ENGINE=django.db.backends.postgresql
      - DB_NAME=green_classroom
      - DB_USER=postgres
      - DB_PASSWORD=postgres
      - DB_HOST=db
      - DB_PORT=5432
      - REDIS_URL=redis://redis:6379/0
      - CELERY_BROKER_URL=redis://redis:6379/0
      - CELERY_RESULT_BACKEND=redis://redis:6379/0
      - SECRET_KEY=your-production-secret-key-here
    ports:
      - "5555:5555"
    depends_on:
      - redis
    restart: unless-stopped

volumes:
  postgres_data:
  redis_data:
49 green_classroom/celery.py (Normal file)
@@ -0,0 +1,49 @@
import os
from celery import Celery
from django.conf import settings

# Set the default Django settings module
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')

app = Celery('green_classroom')

# Use Django's settings file
app.config_from_object('django.conf:settings', namespace='CELERY')

# Auto-discover tasks
app.autodiscover_tasks(lambda: settings.INSTALLED_APPS)

# Task routing
app.conf.task_routes = {
    'core.tasks.*': {'queue': 'crawler'},
    'core.tasks.crawl_website': {'queue': 'crawler'},
    'core.tasks.crawl_all_websites': {'queue': 'crawler'},
}

# Task serialization
app.conf.task_serializer = 'json'
app.conf.result_serializer = 'json'
app.conf.accept_content = ['json']

# Timezone
app.conf.timezone = settings.TIME_ZONE

# Task execution time limits
app.conf.task_time_limit = 30 * 60  # 30 minutes
app.conf.task_soft_time_limit = 25 * 60  # 25 minutes

# Retry behaviour
app.conf.task_acks_late = True
app.conf.task_reject_on_worker_lost = True

# Result backend
app.conf.result_backend = settings.CELERY_RESULT_BACKEND

# Worker settings
app.conf.worker_prefetch_multiplier = 1
app.conf.worker_max_tasks_per_child = 1000


@app.task(bind=True)
def debug_task(self):
    print(f'Request: {self.request!r}')
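With this configuration, any task defined under core.tasks is routed to the 'crawler' queue, so a worker must listen on that queue to pick the work up. A small sketch of defining and enqueuing such a task; the ping task is illustrative only and not part of this diff:

# Hypothetical task in core/tasks.py, shown only to illustrate routing
from celery import shared_task

@shared_task
def ping():
    return 'pong'

# Enqueued calls go to the 'crawler' queue because of the task_routes mapping above
result = ping.delay()
print(result.id)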
@@ -10,7 +10,12 @@ For the full list of settings and their values, see
|
||||
https://docs.djangoproject.com/en/5.1/ref/settings/
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# 加载环境变量
|
||||
load_dotenv()
|
||||
|
||||
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
@@ -19,12 +24,14 @@ BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
|
||||
|
||||
# SECURITY WARNING: keep the secret key used in production secret!
|
||||
SECRET_KEY = 'django-insecure-_kr!&5j#i!)lo(=u-&5ni+21cwxcq)j-35k!ne20)fyx!u6dnl'
|
||||
SECRET_KEY = os.getenv('SECRET_KEY', 'django-insecure-_kr!&5j#i!)lo(=u-&5ni+21cwxcq)j-35k!ne20)fyx!u6dnl')
|
||||
|
||||
# SECURITY WARNING: don't run with debug turned on in production!
|
||||
DEBUG = True
|
||||
DEBUG = os.getenv('DEBUG', 'True').lower() == 'true'
|
||||
|
||||
ALLOWED_HOSTS = []
|
||||
ALLOWED_HOSTS = os.getenv('ALLOWED_HOSTS', 'localhost,127.0.0.1,192.168.9.108,green.yuangyaa.com').split(',')
|
||||
|
||||
CSRF_TRUSTED_ORIGINS = os.getenv('CSRF_TRUSTED_ORIGINS', 'https://green.yuangyaa.com').split(',')
|
||||
|
||||
# Application definition
|
||||
|
||||
@@ -36,8 +43,15 @@ INSTALLED_APPS = [
|
||||
'django.contrib.messages',
|
||||
'django.contrib.staticfiles',
|
||||
'core',
|
||||
'django_celery_beat',
|
||||
'django_celery_results',
|
||||
'rest_framework',
|
||||
'rest_framework.authtoken',
|
||||
]
|
||||
|
||||
# 导入Admin扩展
|
||||
# import core.admin_extended # 暂时注释,避免循环导入
|
||||
|
||||
MIDDLEWARE = [
|
||||
'django.middleware.security.SecurityMiddleware',
|
||||
'django.contrib.sessions.middleware.SessionMiddleware',
|
||||
@@ -71,12 +85,30 @@ WSGI_APPLICATION = 'green_classroom.wsgi.application'
|
||||
# Database
|
||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases
|
||||
|
||||
DATABASES = {
|
||||
'default': {
|
||||
'ENGINE': 'django.db.backends.sqlite3',
|
||||
'NAME': BASE_DIR / 'db.sqlite3',
|
||||
# 根据环境变量选择数据库
|
||||
DB_ENGINE = os.getenv('DB_ENGINE', 'django.db.backends.sqlite3')
|
||||
|
||||
if DB_ENGINE == 'django.db.backends.postgresql':
|
||||
DATABASES = {
|
||||
'default': {
|
||||
'ENGINE': DB_ENGINE,
|
||||
'NAME': os.getenv('DB_NAME', 'green_classroom'),
|
||||
'USER': os.getenv('DB_USER', 'postgres'),
|
||||
'PASSWORD': os.getenv('DB_PASSWORD', ''),
|
||||
'HOST': os.getenv('DB_HOST', 'localhost'),
|
||||
'PORT': os.getenv('DB_PORT', '5432'),
|
||||
'OPTIONS': {
|
||||
'charset': 'utf8mb4',
|
||||
},
|
||||
}
|
||||
}
|
||||
else:
|
||||
DATABASES = {
|
||||
'default': {
|
||||
'ENGINE': 'django.db.backends.sqlite3',
|
||||
'NAME': BASE_DIR / 'db.sqlite3',
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Password validation
|
||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
|
||||
@@ -110,17 +142,122 @@ USE_TZ = True
|
||||
# Static files (CSS, JavaScript, Images)
|
||||
# https://docs.djangoproject.com/en/5.1/howto/static-files/
|
||||
|
||||
STATIC_URL = 'static/'
|
||||
STATIC_URL = '/static/'
|
||||
STATIC_ROOT = os.getenv('STATIC_ROOT', os.path.join(BASE_DIR, 'data', 'static'))
|
||||
|
||||
# Default primary key field type
|
||||
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
|
||||
|
||||
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
||||
|
||||
|
||||
import os
|
||||
|
||||
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
MEDIA_ROOT = os.path.join(BASE_DIR, 'date', 'media')
|
||||
# 媒体文件配置
|
||||
MEDIA_ROOT = os.getenv('MEDIA_ROOT', os.path.join(BASE_DIR, 'data', 'media'))
|
||||
MEDIA_URL = '/media/'
|
||||
|
||||
# Celery configuration
CELERY_BROKER_URL = os.getenv('CELERY_BROKER_URL', 'redis://127.0.0.1:6379/0')
CELERY_RESULT_BACKEND = os.getenv('CELERY_RESULT_BACKEND', 'redis://127.0.0.1:6379/0')
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = TIME_ZONE
CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = 30 * 60  # 30 minutes

# Redis configuration
REDIS_URL = os.getenv('REDIS_URL', 'redis://127.0.0.1:6379/0')

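These CELERY_* keys only take effect if a Celery app binds them from the Django settings under the CELERY_ namespace. A minimal sketch of the conventional green_classroom/celery.py bootstrap, assumed here since that file is not part of this compare:

# green_classroom/celery.py (conventional layout; assumption, not shown in this diff)
import os

from celery import Celery

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')

app = Celery('green_classroom')
# Read every Django setting prefixed with CELERY_ (broker, backend, serializers, time limit, ...)
app.config_from_object('django.conf:settings', namespace='CELERY')
# Discover tasks.py modules in installed apps such as core
app.autodiscover_tasks()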
# Logging configuration
LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'verbose': {
            'format': '{levelname} {asctime} {module} {process:d} {thread:d} {message}',
            'style': '{',
        },
        'simple': {
            'format': '{levelname} {message}',
            'style': '{',
        },
    },
    'handlers': {
        'file': {
            'level': os.getenv('LOG_LEVEL', 'INFO'),
            'class': 'logging.FileHandler',
            'filename': os.getenv('LOG_FILE', os.path.join(BASE_DIR, 'data', 'logs', 'django.log')),
            'formatter': 'verbose',
        },
        'console': {
            'level': os.getenv('LOG_LEVEL', 'INFO'),
            'class': 'logging.StreamHandler',
            'formatter': 'simple',
        },
    },
    'root': {
        'handlers': ['console', 'file'],
        'level': os.getenv('LOG_LEVEL', 'INFO'),
    },
    'loggers': {
        'django': {
            'handlers': ['console', 'file'],
            'level': os.getenv('LOG_LEVEL', 'INFO'),
            'propagate': False,
        },
        'core': {
            'handlers': ['console', 'file'],
            'level': os.getenv('LOG_LEVEL', 'INFO'),
            'propagate': False,
        },
    },
}

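With a dedicated 'core' logger defined above, application code in the core app can log through the standard library instead of print. A small illustrative usage; the module path is an example, not taken from this diff:

# e.g. inside core/management/commands/crawl_rmrb.py (illustrative only)
import logging

logger = logging.getLogger('core')  # routed to both the console and data/logs/django.log

def log_crawl_result(url: str, ok: bool) -> None:
    # INFO goes to both handlers; failures stand out at WARNING level
    if ok:
        logger.info('crawled %s', url)
    else:
        logger.warning('failed to crawl %s', url)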
# Security settings
if not DEBUG:
    SECURE_BROWSER_XSS_FILTER = True
    SECURE_CONTENT_TYPE_NOSNIFF = True
    X_FRAME_OPTIONS = 'DENY'
    SECURE_HSTS_SECONDS = 31536000
    SECURE_HSTS_INCLUDE_SUBDOMAINS = True
    SECURE_HSTS_PRELOAD = True

# Crawler settings
CRAWLER_TIMEOUT = int(os.getenv('CRAWLER_TIMEOUT', 30))
CRAWLER_MAX_RETRIES = int(os.getenv('CRAWLER_MAX_RETRIES', 3))
CRAWLER_DELAY = int(os.getenv('CRAWLER_DELAY', 1))

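CRAWLER_TIMEOUT, CRAWLER_MAX_RETRIES and CRAWLER_DELAY are plain integers, so each crawl command is expected to apply them itself. A hedged sketch of how a fetch helper might honour them with requests (the actual crawl_* command code is not shown in this compare, so the helper name is illustrative):

# Illustrative helper; assumes CRAWLER_MAX_RETRIES >= 1 (default is 3).
import time

import requests
from django.conf import settings

def fetch_with_retries(url: str) -> requests.Response:
    for attempt in range(1, settings.CRAWLER_MAX_RETRIES + 1):
        try:
            response = requests.get(url, timeout=settings.CRAWLER_TIMEOUT)
            response.raise_for_status()
            return response
        except requests.RequestException:
            if attempt == settings.CRAWLER_MAX_RETRIES:
                raise  # out of retries, surface the last error
            time.sleep(settings.CRAWLER_DELAY)  # polite delay before retrying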
# Selenium settings
SELENIUM_HEADLESS = os.getenv('SELENIUM_HEADLESS', 'True').lower() == 'true'
CHROME_DRIVER_PATH = os.getenv('CHROME_DRIVER_PATH', '/usr/bin/chromedriver')

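How a crawler turns these two settings into a browser session is not shown here. A minimal sketch using the Selenium 4 API (selenium==4.34.2 is pinned in requirements.txt); the function name is an assumption:

# Illustrative only; the project's actual driver setup is not part of this diff.
from django.conf import settings
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

def build_chrome_driver() -> webdriver.Chrome:
    options = Options()
    if settings.SELENIUM_HEADLESS:
        options.add_argument('--headless=new')  # run Chrome without a display
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    service = Service(executable_path=settings.CHROME_DRIVER_PATH)
    return webdriver.Chrome(service=service, options=options)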
# Sentry monitoring (optional)
SENTRY_DSN = os.getenv('SENTRY_DSN')
if SENTRY_DSN:
    import sentry_sdk
    from sentry_sdk.integrations.django import DjangoIntegration

    sentry_sdk.init(
        dsn=SENTRY_DSN,
        integrations=[DjangoIntegration()],
        traces_sample_rate=1.0,
        send_default_pii=True
    )

# Django REST Framework configuration
REST_FRAMEWORK = {
    'DEFAULT_RENDERER_CLASSES': [
        'rest_framework.renderers.JSONRenderer',
        'rest_framework.renderers.BrowsableAPIRenderer',
    ],
    'DEFAULT_PERMISSION_CLASSES': [
        'rest_framework.permissions.IsAuthenticated',
    ],
    'DEFAULT_AUTHENTICATION_CLASSES': [
        'rest_framework.authentication.SessionAuthentication',
        'rest_framework.authentication.TokenAuthentication',
    ],
}

DATA_UPLOAD_MAX_NUMBER_FIELDS = 10240

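Because the API defaults to IsAuthenticated with TokenAuthentication enabled (rest_framework.authtoken is added to INSTALLED_APPS above), API clients have to send a token header. A brief illustrative client call; the endpoint path is hypothetical, since core's URL patterns are not shown in this compare:

# Illustrative client usage; replace the URL with a real core endpoint.
import requests

token = 'paste-a-drf-token-here'  # e.g. created via `python manage.py drf_create_token <user>`
response = requests.get(
    'https://green.yuangyaa.com/api/articles/',  # hypothetical endpoint
    headers={'Authorization': f'Token {token}'},  # DRF TokenAuthentication header format
    timeout=30,
)
response.raise_for_status()
print(response.json())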
@@ -4,15 +4,13 @@ from django.contrib import admin
from django.urls import path, include

# Import the custom admin site instances
from core.admin import news_cn_admin, dongfangyancao_admin


urlpatterns = [
    path('admin/', admin.site.urls),
    path('news_cn_admin/', news_cn_admin.urls),
    path('dongfangyancao_admin/', dongfangyancao_admin.urls),
    # Front-end routes are served by the core app's urls
    path('', include('core.urls')),
]

if settings.DEBUG:
    urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)

@@ -1,31 +1,83 @@
amqp==5.3.1
asgiref==3.9.1
asttokens==3.0.0
attrs==25.3.0
beautifulsoup4==4.13.4
billiard==4.2.1
bs4==0.0.2
celery==5.5.3
certifi==2025.8.3
charset-normalizer==3.4.3
click==8.2.1
click-didyoumean==0.3.1
click-plugins==1.1.1.2
click-repl==0.3.0
coverage==7.10.3
cron-descriptor==1.4.5
decorator==5.2.1
Django==5.1
django-celery-beat==2.8.1
django-db-connection-pool==1.2.6
django-timezone-field==7.1
django_celery_results==2.6.0
djangorestframework==3.16.1
executing==2.2.0
factory_boy==3.3.3
Faker==37.5.3
greenlet==3.2.4
gunicorn==23.0.0
h11==0.16.0
idna==3.10
iniconfig==2.1.0
ipython==9.4.0
ipython_pygments_lexers==1.1.1
jedi==0.19.2
kombu==5.5.4
lxml==6.0.0
m3u8==6.0.0
matplotlib-inline==0.1.7
outcome==1.3.0.post0
packaging==25.0
parso==0.8.4
pexpect==4.9.0
pluggy==1.6.0
prompt_toolkit==3.0.51
psycopg2-binary==2.9.10
ptyprocess==0.7.0
pure_eval==0.2.3
pycryptodome==3.23.0
Pygments==2.19.2
PySocks==1.7.1
pytest==8.4.1
pytest-cov==6.2.1
pytest-django==4.11.1
python-crontab==3.3.0
python-dateutil==2.9.0.post0
python-docx==1.2.0
python-dotenv==1.1.1
redis==6.4.0
requests==2.32.4
selenium==4.34.2
sentry-sdk==2.35.0
six==1.17.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.7
SQLAlchemy==2.0.43
sqlparams==6.2.0
sqlparse==0.5.3
stack-data==0.6.3
tqdm==4.67.1
traitlets==5.14.3
trio==0.30.0
trio-websocket==0.12.2
typing_extensions==4.14.1
tzdata==2025.2
urllib3==2.5.0
uv==0.8.8
uvicorn==0.35.0
vine==5.1.0
wcwidth==0.2.13
webdriver-manager==4.0.2
websocket-client==1.8.0
wsproto==1.2.0

122
test_crawlers.py
Normal file
122
test_crawlers.py
Normal file
@@ -0,0 +1,122 @@
|
||||
#!/usr/bin/env python
"""
Test script for the crawler commands.
Verifies that all crawler management commands are wired up correctly.
"""

import os
import sys
import django
from django.core.management import call_command, get_commands
from django.test.utils import get_runner
from django.conf import settings

# Set up the Django environment
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'green_classroom.settings')
django.setup()

def test_crawler_commands():
    """Test all crawler commands"""

    # List of all crawler commands
    crawler_commands = [
        'crawl_rmrb',
        'crawl_xinhua',
        'crawl_cctv',
        'crawl_qiushi',
        'crawl_pla',
        'crawl_gmrb',
        'crawl_jjrb',
        'crawl_chinadaily',
        'crawl_grrb',
        'crawl_kjrb',
        'crawl_rmzxb',
        'crawl_zgjwjc',
        'crawl_chinanews',
        'crawl_xxsb',
        'crawl_zgqnb',
        'crawl_zgfnb',
        'crawl_fzrb',
        'crawl_nmrb',
        'crawl_xuexi',
        'crawl_qizhi',
        'crawl_china',
        'crawl_all_media'
    ]

    print("Starting crawler command tests...")
    print("=" * 50)

    for command in crawler_commands:
        try:
            print(f"Testing command: {command}")
            # Only check that the command is registered; do not actually crawl.
            # Real crawl test logic can be added here later.
            if command in get_commands():
                print(f"✓ {command} command is available")
            else:
                print(f"✗ {command} command is not registered")
        except Exception as e:
            print(f"✗ {command} command test failed: {e}")

    print("=" * 50)
    print("Crawler command tests finished")

def test_export_command():
    """Test the export command"""
    try:
        print("Testing export command...")
        # Export command test logic can be added here
        print("✓ Export command is available")
    except Exception as e:
        print(f"✗ Export command test failed: {e}")

def test_models():
    """Test the data models"""
    try:
        from core.models import Website, Article
        print("Testing data models...")

        # Test creating a Website object
        website, created = Website.objects.get_or_create(
            name="Test website",
            defaults={
                'base_url': 'https://test.com',
                'article_list_url': 'https://test.com',
                'article_selector': 'a'
            }
        )
        print(f"✓ Website model test passed: {website.name}")

        # Clean up the test data
        if created:
            website.delete()

    except Exception as e:
        print(f"✗ Data model test failed: {e}")

def main():
    """Entry point"""
    print("Central mainstream media crawler system tests")
    print("=" * 50)

    # Test the data models
    test_models()
    print()

    # Test the crawler commands
    test_crawler_commands()
    print()

    # Test the export command
    test_export_command()
    print()

    print("All tests finished!")
    print("=" * 50)
    print("Usage:")
    print("1. Crawl a single outlet: python manage.py crawl_rmrb")
    print("2. Crawl all media: python manage.py crawl_all_media")
    print("3. Export data: python manage.py export_articles --format json")
    print("4. Show help: python manage.py help")

if __name__ == '__main__':
    main()
