fix bugs and support all platforms
6  .gitignore  (vendored)
@@ -180,5 +180,11 @@ cython_debug/
#
#####################################

# 数据目录
data/
date/media/

# 配置文件
config/
.env

517  core/admin.py  (deleted)
@@ -1,517 +0,0 @@
from .models import Website, Article
# 添加actions相关的导入
from django.contrib import messages
# 添加导出功能所需导入
import csv
from django.http import HttpResponse
import json
# 添加视图函数需要的导入
from django.shortcuts import render, redirect
from django.urls import path
from django.contrib import admin
from django.core.management import call_command

# 添加运行爬虫的视图函数
def run_crawler_view(request):
    """
    管理后台运行爬虫的视图
    """
    if request.method == 'POST':
        website_name = request.POST.get('website_name')
        if not website_name:
            messages.error(request, '请选择要爬取的网站')
            return redirect('admin:core_article_changelist')

        try:
            # 动态获取网站对象
            website = Website.objects.get(name=website_name)

            # 根据网站对象确定要执行的爬虫命令
            # 移除默认的通用爬虫,每个网站必须配置自己的爬虫命令
            crawler_name = getattr(website, 'crawler_command', None)

            # 如果网站没有配置爬虫命令,则报错
            if not crawler_name:
                messages.error(request, f'网站 {website_name} 未配置爬虫命令')
                return redirect('admin:core_article_changelist')

            # 运行爬虫命令,传递网站名称
            call_command(crawler_name, website_name)

            messages.success(request, f'成功执行爬虫: {website_name}')
        except Website.DoesNotExist:
            messages.error(request, f'网站不存在: {website_name}')
        except Exception as e:
            messages.error(request, f'执行爬虫失败: {str(e)}')

    return redirect('admin:core_article_changelist')


@admin.register(Website)
|
||||
class WebsiteAdmin(admin.ModelAdmin):
|
||||
list_display = ('name', 'base_url', 'enabled')
|
||||
|
||||
|
||||
# 为ArticleAdmin添加自定义动作
|
||||
@admin.register(Article)
|
||||
class ArticleAdmin(admin.ModelAdmin):
|
||||
list_display = ('title', 'website', 'pub_date')
|
||||
search_fields = ('title', 'content')
|
||||
# 添加动作选项
|
||||
actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json',
|
||||
'export_as_word', 'export_with_media']
|
||||
|
||||
def get_websites(self):
|
||||
"""获取所有启用的网站"""
|
||||
return Website.objects.filter(enabled=True)
|
||||
|
||||
# 重写get_urls方法,添加自定义URL
|
||||
def get_urls(self):
|
||||
urls = super().get_urls()
|
||||
custom_urls = [
|
||||
path('run-crawler/', self.admin_site.admin_view(run_crawler_view), name='run_crawler'),
|
||||
]
|
||||
return custom_urls + urls
|
||||
|
||||
def export_as_csv(self, request, queryset):
|
||||
"""导出选中的文章为CSV格式"""
|
||||
meta = self.model._meta
|
||||
field_names = [field.name for field in meta.fields]
|
||||
|
||||
response = HttpResponse(content_type='text/csv')
|
||||
response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta)
|
||||
writer = csv.writer(response)
|
||||
|
||||
writer.writerow(field_names)
|
||||
for obj in queryset:
|
||||
row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field) for field in
|
||||
field_names]
|
||||
writer.writerow(row)
|
||||
|
||||
return response
|
||||
|
||||
export_as_csv.short_description = "导出选中文章为CSV格式"
|
||||
|
||||
def export_as_json(self, request, queryset):
|
||||
"""导出选中的文章为JSON格式"""
|
||||
response = HttpResponse(content_type='application/json')
|
||||
response['Content-Disposition'] = 'attachment; filename=articles.json'
|
||||
|
||||
# 构造要导出的数据
|
||||
articles_data = []
|
||||
for article in queryset:
|
||||
articles_data.append({
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'website': article.website.name,
|
||||
'url': article.url,
|
||||
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
|
||||
'content': article.content,
|
||||
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'media_files': article.media_files
|
||||
})
|
||||
|
||||
# 写入JSON数据
|
||||
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
|
||||
return response
|
||||
|
||||
export_as_json.short_description = "导出选中文章为JSON格式"
|
||||
|
||||
def export_as_word(self, request, queryset):
|
||||
"""导出选中的文章为Word格式"""
|
||||
try:
|
||||
from docx import Document
|
||||
from io import BytesIO
|
||||
from docx.shared import Inches
|
||||
except ImportError:
|
||||
self.message_user(request, "缺少python-docx库,请安装: pip install python-docx", messages.ERROR)
|
||||
return
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading('文章导出', 0)
|
||||
|
||||
for article in queryset:
|
||||
# 添加文章标题
|
||||
doc.add_heading(article.title, level=1)
|
||||
|
||||
# 添加文章元数据
|
||||
doc.add_paragraph(f"网站: {article.website.name}")
|
||||
doc.add_paragraph(f"URL: {article.url}")
|
||||
doc.add_paragraph(
|
||||
f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
|
||||
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
# 添加文章内容
|
||||
doc.add_heading('内容', level=2)
|
||||
# 简单处理HTML内容,移除标签并处理图片
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(article.content, 'html.parser')
|
||||
|
||||
# 处理内容中的图片
|
||||
for img in soup.find_all('img'):
|
||||
src = img.get('src', '')
|
||||
if src:
|
||||
# 尝试添加图片到文档
|
||||
try:
|
||||
import os
|
||||
from django.conf import settings
|
||||
import requests
|
||||
from io import BytesIO
|
||||
|
||||
# 构建完整的图片路径
|
||||
if src.startswith('http'):
|
||||
# 网络图片
|
||||
response = requests.get(src, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
# 本地图片
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
|
||||
if os.path.exists(full_path):
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
except Exception as e:
|
||||
# 如果添加图片失败,添加图片URL作为文本
|
||||
doc.add_paragraph(f"[图片: {src}]")
|
||||
|
||||
# 移除原始img标签
|
||||
img.decompose()
|
||||
|
||||
content_text = soup.get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 添加媒体文件信息
|
||||
if article.media_files:
|
||||
doc.add_heading('媒体文件', level=2)
|
||||
for media_file in article.media_files:
|
||||
try:
|
||||
import os
|
||||
from django.conf import settings
|
||||
from io import BytesIO
|
||||
import requests
|
||||
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 添加图片到文档
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
response = requests.get(media_file, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
except Exception as e:
|
||||
doc.add_paragraph(media_file)
|
||||
|
||||
# 添加分页符
|
||||
doc.add_page_break()
|
||||
|
||||
# 保存到内存
|
||||
buffer = BytesIO()
|
||||
doc.save(buffer)
|
||||
buffer.seek(0)
|
||||
|
||||
# 创建HttpResponse
|
||||
from django.http import HttpResponse
|
||||
response = HttpResponse(buffer.getvalue(),
|
||||
content_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
|
||||
response['Content-Disposition'] = 'attachment; filename=articles.docx'
|
||||
return response
|
||||
|
||||
export_as_word.short_description = "导出选中文章为Word格式"
|
||||
|
||||
def export_with_media(self, request, queryset):
|
||||
"""导出选中的文章及媒体文件为ZIP包"""
|
||||
try:
|
||||
from docx import Document
|
||||
from io import BytesIO
|
||||
from docx.shared import Inches
|
||||
import zipfile
|
||||
except ImportError:
|
||||
self.message_user(request, "缺少必要库,请安装: pip install python-docx", messages.ERROR)
|
||||
return
|
||||
|
||||
# 创建内存中的ZIP文件
|
||||
zip_buffer = BytesIO()
|
||||
|
||||
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
|
||||
for article in queryset:
|
||||
# 为每篇文章创建单独的文件夹
|
||||
article_folder = f"article_{article.id}_{article.title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('*', '_').replace('?', '_').replace('"', '_').replace('<', '_').replace('>', '_').replace('|', '_')}"
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading(article.title, 0)
|
||||
|
||||
# 添加文章元数据
|
||||
doc.add_paragraph(f"网站: {article.website.name}")
|
||||
doc.add_paragraph(f"URL: {article.url}")
|
||||
doc.add_paragraph(
|
||||
f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
|
||||
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
# 添加文章内容
|
||||
doc.add_heading('内容', level=2)
|
||||
# 简单处理HTML内容,移除标签并处理图片
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(article.content, 'html.parser')
|
||||
|
||||
# 处理内容中的图片
|
||||
for img in soup.find_all('img'):
|
||||
src = img.get('src', '')
|
||||
if src:
|
||||
# 尝试添加图片到文档
|
||||
try:
|
||||
import os
|
||||
from django.conf import settings
|
||||
import requests
|
||||
|
||||
# 构建完整的图片路径
|
||||
if src.startswith('http'):
|
||||
# 网络图片
|
||||
response = requests.get(src, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
# 将网络文件保存到ZIP
|
||||
zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(src)),
|
||||
response.content)
|
||||
else:
|
||||
# 本地图片
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
|
||||
if os.path.exists(full_path):
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', src.lstrip('/')))
|
||||
except Exception as e:
|
||||
# 如果添加图片失败,添加图片URL作为文本
|
||||
doc.add_paragraph(f"[图片: {src}]")
|
||||
|
||||
# 移除原始img标签
|
||||
img.decompose()
|
||||
|
||||
content_text = soup.get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 添加媒体文件信息并打包媒体文件
|
||||
if article.media_files:
|
||||
doc.add_heading('媒体文件', level=2)
|
||||
for media_file in article.media_files:
|
||||
try:
|
||||
import os
|
||||
from django.conf import settings
|
||||
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
# 检查文件扩展名以确定处理方式
|
||||
file_extension = os.path.splitext(media_file)[1].lower()
|
||||
|
||||
# 图片文件处理
|
||||
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
|
||||
if os.path.exists(full_path):
|
||||
# 添加图片到文档
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
response = requests.get(media_file, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
# 将网络文件保存到ZIP
|
||||
zip_file.writestr(
|
||||
os.path.join(article_folder, 'media', os.path.basename(media_file)),
|
||||
response.content)
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
# 视频文件处理
|
||||
elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
|
||||
# 视频文件只添加到ZIP包中,不在Word文档中显示
|
||||
if os.path.exists(full_path):
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
|
||||
# 在Word文档中添加视频文件信息
|
||||
doc.add_paragraph(f"[视频文件: {media_file}]")
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
# 将网络文件保存到ZIP
|
||||
response = requests.get(media_file, timeout=10)
|
||||
zip_file.writestr(
|
||||
os.path.join(article_folder, 'media', os.path.basename(media_file)),
|
||||
response.content)
|
||||
doc.add_paragraph(f"[视频文件: {media_file}]")
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
# 其他文件类型
|
||||
else:
|
||||
if os.path.exists(full_path):
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
|
||||
doc.add_paragraph(f"[文件: {media_file}]")
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
response = requests.get(media_file, timeout=10)
|
||||
zip_file.writestr(
|
||||
os.path.join(article_folder, 'media', os.path.basename(media_file)),
|
||||
response.content)
|
||||
doc.add_paragraph(f"[文件: {media_file}]")
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
except Exception as e:
|
||||
doc.add_paragraph(media_file)
|
||||
|
||||
# 保存每篇文章的Word文档到ZIP文件中的对应文件夹
|
||||
doc_buffer = BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'),
|
||||
doc_buffer.read())
|
||||
|
||||
# 创建HttpResponse
|
||||
zip_buffer.seek(0)
|
||||
from django.http import HttpResponse
|
||||
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
|
||||
response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
|
||||
return response
|
||||
|
||||
export_with_media.short_description = "导出选中文章及媒体文件(ZIP包)"
|
||||
|
||||
|
||||
# 为不同网站创建专门的文章管理类
|
||||
class NewsCnArticleAdmin(admin.ModelAdmin):
|
||||
list_display = ('title', 'pub_date')
|
||||
search_fields = ('title', 'content')
|
||||
list_filter = ('pub_date',)
|
||||
actions = ['export_as_csv', 'export_as_json']
|
||||
|
||||
def get_queryset(self, request):
|
||||
qs = super().get_queryset(request)
|
||||
# 只显示新华网的文章
|
||||
return qs.filter(website__name='www.news.cn')
|
||||
|
||||
def export_as_csv(self, request, queryset):
|
||||
"""导出选中的文章为CSV格式"""
|
||||
meta = self.model._meta
|
||||
field_names = [field.name for field in meta.fields if field.name != 'content'] # 排除content字段以减小CSV大小
|
||||
|
||||
response = HttpResponse(content_type='text/csv')
|
||||
response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv'
|
||||
writer = csv.writer(response)
|
||||
|
||||
writer.writerow(field_names)
|
||||
for obj in queryset:
|
||||
row = []
|
||||
for field in field_names:
|
||||
value = getattr(obj, field)
|
||||
if callable(value):
|
||||
value = value()
|
||||
if field == 'website':
|
||||
value = value.name
|
||||
row.append(value)
|
||||
writer.writerow(row)
|
||||
|
||||
return response
|
||||
|
||||
export_as_csv.short_description = "导出选中文章为CSV格式"
|
||||
|
||||
def export_as_json(self, request, queryset):
|
||||
"""导出选中的文章为JSON格式"""
|
||||
response = HttpResponse(content_type='application/json')
|
||||
response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json'
|
||||
|
||||
# 构造要导出的数据
|
||||
articles_data = []
|
||||
for article in queryset:
|
||||
articles_data.append({
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'website': article.website.name,
|
||||
'url': article.url,
|
||||
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
|
||||
'content': article.content,
|
||||
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'media_files': article.media_files
|
||||
})
|
||||
|
||||
# 写入JSON数据
|
||||
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
|
||||
return response
|
||||
|
||||
export_as_json.short_description = "导出选中文章为JSON格式"
|
||||
|
||||
|
||||
class DongfangyancaoArticleAdmin(admin.ModelAdmin):
|
||||
list_display = ('title', 'pub_date')
|
||||
search_fields = ('title', 'content')
|
||||
list_filter = ('pub_date',)
|
||||
# 添加动作选项
|
||||
actions = ['delete_selected_articles', 'delete_all_articles', 'export_as_csv', 'export_as_json']
|
||||
|
||||
def get_queryset(self, request):
|
||||
qs = super().get_queryset(request)
|
||||
# 只显示东方烟草报的文章
|
||||
return qs.filter(website__name='东方烟草报')
|
||||
|
||||
def delete_all_articles(self, request, queryset):
|
||||
"""删除当前筛选的所有文章(东方烟草报的所有文章)"""
|
||||
# 删除所有东方烟草报的文章
|
||||
deleted_count = self.get_queryset(request).delete()[0]
|
||||
self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS)
|
||||
|
||||
# 设置动作的显示名称
|
||||
delete_all_articles.short_description = "删除所有当前筛选的文章"
|
||||
|
||||
def export_as_csv(self, request, queryset):
|
||||
"""导出选中的文章为CSV格式"""
|
||||
meta = self.model._meta
|
||||
field_names = [field.name for field in meta.fields if field.name != 'content'] # 排除content字段以减小CSV大小
|
||||
|
||||
response = HttpResponse(content_type='text/csv')
|
||||
response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.csv'
|
||||
writer = csv.writer(response)
|
||||
|
||||
writer.writerow(field_names)
|
||||
for obj in queryset:
|
||||
row = []
|
||||
for field in field_names:
|
||||
value = getattr(obj, field)
|
||||
if callable(value):
|
||||
value = value()
|
||||
if field == 'website':
|
||||
value = value.name
|
||||
row.append(value)
|
||||
writer.writerow(row)
|
||||
|
||||
return response
|
||||
|
||||
export_as_csv.short_description = "导出选中文章为CSV格式"
|
||||
|
||||
def export_as_json(self, request, queryset):
|
||||
"""导出选中的文章为JSON格式"""
|
||||
response = HttpResponse(content_type='application/json')
|
||||
response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json'
|
||||
|
||||
# 构造要导出的数据
|
||||
articles_data = []
|
||||
for article in queryset:
|
||||
articles_data.append({
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'website': article.website.name,
|
||||
'url': article.url,
|
||||
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
|
||||
'content': article.content,
|
||||
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'media_files': article.media_files
|
||||
})
|
||||
|
||||
# 写入JSON数据
|
||||
response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
|
||||
return response
|
||||
|
||||
export_as_json.short_description = "导出选中文章为JSON格式"
|
||||
|
||||
# 在各自的管理站点中注册模型
|
||||
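Note: run_crawler_view in core/admin.py above resolves the command to run from website.crawler_command, but that field is not part of this diff. A minimal, hypothetical sketch of what such a field could look like on the Website model (illustration only, not the project's actual core/models.py):

# Hypothetical sketch -- core/models.py is not shown in this commit.
from django.db import models


class Website(models.Model):
    name = models.CharField(max_length=200)
    base_url = models.URLField()
    enabled = models.BooleanField(default=True)
    # Management-command name that run_crawler_view passes to call_command();
    # an empty value means "no crawler configured" and the view reports an error.
    crawler_command = models.CharField(max_length=100, blank=True, default='')

    def __str__(self):
        return self.name
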
384  core/admin_extended.py  (new file)
@@ -0,0 +1,384 @@
|
||||
"""
|
||||
Django Admin扩展
|
||||
提供增强的管理界面功能
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from django.contrib import admin
|
||||
from django.contrib.admin import SimpleListFilter
|
||||
from django.contrib.admin.utils import model_format_dict
|
||||
from django.contrib import messages
|
||||
from django.http import HttpResponseRedirect
|
||||
from django.urls import path, reverse
|
||||
from django.utils.html import format_html
|
||||
from django.utils import timezone
|
||||
from django.db.models import Count, Q
|
||||
from django.core.cache import cache
|
||||
|
||||
from .models import Website, Article
|
||||
from .tasks import crawl_website, crawl_all_websites, cleanup_old_articles
|
||||
from .distributed_crawler import distributed_crawler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WebsiteStatusFilter(SimpleListFilter):
|
||||
"""网站状态过滤器"""
|
||||
title = '网站状态'
|
||||
parameter_name = 'status'
|
||||
|
||||
def lookups(self, request, model_admin):
|
||||
return (
|
||||
('enabled', '已启用'),
|
||||
('disabled', '已禁用'),
|
||||
('no_articles', '无文章'),
|
||||
('recent_crawl', '最近爬取'),
|
||||
)
|
||||
|
||||
def queryset(self, request, queryset):
|
||||
if self.value() == 'enabled':
|
||||
return queryset.filter(enabled=True)
|
||||
elif self.value() == 'disabled':
|
||||
return queryset.filter(enabled=False)
|
||||
elif self.value() == 'no_articles':
|
||||
return queryset.annotate(article_count=Count('article')).filter(article_count=0)
|
||||
elif self.value() == 'recent_crawl':
|
||||
week_ago = timezone.now() - timedelta(days=7)
|
||||
return queryset.filter(last_crawl__gte=week_ago)
|
||||
return queryset
|
||||
|
||||
|
||||
class ArticleDateFilter(SimpleListFilter):
|
||||
"""文章日期过滤器"""
|
||||
title = '发布时间'
|
||||
parameter_name = 'date_range'
|
||||
|
||||
def lookups(self, request, model_admin):
|
||||
return (
|
||||
('today', '今天'),
|
||||
('week', '本周'),
|
||||
('month', '本月'),
|
||||
('quarter', '本季度'),
|
||||
)
|
||||
|
||||
def queryset(self, request, queryset):
|
||||
now = timezone.now()
|
||||
if self.value() == 'today':
|
||||
return queryset.filter(created_at__date=now.date())
|
||||
elif self.value() == 'week':
|
||||
week_start = now - timedelta(days=now.weekday())
|
||||
return queryset.filter(created_at__gte=week_start.replace(hour=0, minute=0, second=0))
|
||||
elif self.value() == 'month':
|
||||
return queryset.filter(created_at__year=now.year, created_at__month=now.month)
|
||||
elif self.value() == 'quarter':
|
||||
quarter = (now.month - 1) // 3
|
||||
quarter_start_month = quarter * 3 + 1
|
||||
return queryset.filter(
|
||||
created_at__year=now.year,
|
||||
created_at__month__gte=quarter_start_month,
|
||||
created_at__month__lt=quarter_start_month + 3
|
||||
)
|
||||
return queryset
|
||||
|
||||
|
||||
class WebsiteAdmin(admin.ModelAdmin):
|
||||
"""网站管理"""
|
||||
list_display = [
|
||||
'name', 'base_url', 'enabled', 'article_count',
|
||||
'last_crawl_display', 'status_indicator', 'actions_column'
|
||||
]
|
||||
list_filter = [WebsiteStatusFilter, 'enabled']
|
||||
search_fields = ['name', 'base_url']
|
||||
readonly_fields = ['article_count']
|
||||
actions = ['enable_websites', 'disable_websites', 'crawl_selected', 'crawl_all']
|
||||
|
||||
fieldsets = (
|
||||
('基本信息', {
|
||||
'fields': ('name', 'base_url', 'enabled')
|
||||
}),
|
||||
('统计信息', {
|
||||
'fields': ('article_count',),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
('时间信息', {
|
||||
'fields': (),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
)
|
||||
|
||||
# 添加get_websites方法以支持模板中的网站选择
|
||||
def get_websites(self, request):
|
||||
"""获取所有启用的网站,用于模板中的选择框"""
|
||||
return Website.objects.filter(enabled=True)
|
||||
|
||||
def article_count(self, obj):
|
||||
"""文章数量"""
|
||||
return obj.article_set.count()
|
||||
|
||||
article_count.short_description = '文章数量'
|
||||
|
||||
def last_crawl_display(self, obj):
|
||||
"""最后爬取时间显示"""
|
||||
return '未实现'
|
||||
|
||||
last_crawl_display.short_description = '最后爬取'
|
||||
|
||||
def status_indicator(self, obj):
|
||||
"""状态指示器"""
|
||||
if obj.enabled:
|
||||
return format_html('<span style="color: green;">●</span> 正常')
|
||||
else:
|
||||
return format_html('<span style="color: red;">●</span> 禁用')
|
||||
|
||||
status_indicator.short_description = '状态'
|
||||
|
||||
def actions_column(self, obj):
|
||||
"""操作列"""
|
||||
return format_html(
|
||||
'<a href="{}" class="button">爬取</a> '
|
||||
'<a href="{}" class="button">查看文章</a>',
|
||||
reverse('admin:crawl_website', args=[obj.id]),
|
||||
reverse('admin:core_article_changelist') + f'?website__id__exact={obj.id}'
|
||||
)
|
||||
|
||||
actions_column.short_description = '操作'
|
||||
|
||||
def enable_websites(self, request, queryset):
|
||||
"""启用选中的网站"""
|
||||
updated = queryset.update(enabled=True)
|
||||
self.message_user(request, f'成功启用 {updated} 个网站')
|
||||
|
||||
enable_websites.short_description = '启用选中的网站'
|
||||
|
||||
def disable_websites(self, request, queryset):
|
||||
"""禁用选中的网站"""
|
||||
updated = queryset.update(enabled=False)
|
||||
self.message_user(request, f'成功禁用 {updated} 个网站')
|
||||
|
||||
disable_websites.short_description = '禁用选中的网站'
|
||||
|
||||
def crawl_selected(self, request, queryset):
|
||||
"""爬取选中的网站"""
|
||||
for website in queryset:
|
||||
try:
|
||||
task = crawl_website.delay(website.id)
|
||||
self.message_user(
|
||||
request,
|
||||
f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})',
|
||||
messages.SUCCESS
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "[Errno 61] Connection refused" in error_msg:
|
||||
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
|
||||
else:
|
||||
detailed_msg = error_msg
|
||||
self.message_user(
|
||||
request,
|
||||
f'网站 {website.name} 爬取任务启动失败: {detailed_msg}',
|
||||
messages.ERROR
|
||||
)
|
||||
|
||||
crawl_selected.short_description = '爬取选中的网站'
|
||||
|
||||
def crawl_all(self, request, queryset):
|
||||
try:
|
||||
task = crawl_all_websites.delay()
|
||||
self.message_user(
|
||||
request,
|
||||
f'批量爬取任务已启动 (任务ID: {task.id})',
|
||||
messages.SUCCESS
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "[Errno 61] Connection refused" in error_msg:
|
||||
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
|
||||
else:
|
||||
detailed_msg = error_msg
|
||||
self.message_user(
|
||||
request,
|
||||
f'批量爬取任务启动失败: {detailed_msg}',
|
||||
messages.ERROR
|
||||
)
|
||||
|
||||
# crawl_all.short_description = '爬取所有网站'
|
||||
|
||||
def get_urls(self):
|
||||
"""添加自定义URL"""
|
||||
urls = super().get_urls()
|
||||
custom_urls = [
|
||||
path(
|
||||
'<int:website_id>/crawl/',
|
||||
self.admin_site.admin_view(self.crawl_website_view),
|
||||
name='crawl_website',
|
||||
),
|
||||
path(
|
||||
'run-crawler/',
|
||||
self.admin_site.admin_view(self.run_crawler_view),
|
||||
name='run_crawler',
|
||||
),
|
||||
]
|
||||
return custom_urls + urls
|
||||
|
||||
def crawl_website_view(self, request, website_id):
|
||||
"""爬取单个网站视图"""
|
||||
try:
|
||||
website = Website.objects.get(id=website_id)
|
||||
task = crawl_website.delay(website_id)
|
||||
self.message_user(
|
||||
request,
|
||||
f'网站 {website.name} 爬取任务已启动 (任务ID: {task.id})',
|
||||
messages.SUCCESS
|
||||
)
|
||||
except Website.DoesNotExist:
|
||||
self.message_user(request, '网站不存在', messages.ERROR)
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "[Errno 61] Connection refused" in error_msg:
|
||||
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
|
||||
else:
|
||||
detailed_msg = error_msg
|
||||
self.message_user(request, f'爬取任务启动失败: {detailed_msg}', messages.ERROR)
|
||||
|
||||
return HttpResponseRedirect(reverse('admin:core_website_changelist'))
|
||||
|
||||
def run_crawler_view(self, request):
|
||||
"""运行爬虫视图"""
|
||||
try:
|
||||
task = crawl_all_websites.delay()
|
||||
self.message_user(
|
||||
request,
|
||||
f'批量爬取任务已启动 (任务ID: {task.id})',
|
||||
messages.SUCCESS
|
||||
)
|
||||
except Exception as e:
|
||||
error_msg = str(e)
|
||||
if "[Errno 61] Connection refused" in error_msg:
|
||||
detailed_msg = "连接被拒绝,可能是Redis或其他依赖服务未启动。请检查以下几点:\n1. Redis服务是否运行 (尝试运行: redis-server)\n2. 如果使用Docker,请确保容器正在运行\n3. 检查Django配置中的CELERY_BROKER_URL设置\n4. 在本地开发环境中,可以运行 'python manage.py runserver' 和 'celery -A myproject worker -l info' 来启动必要的服务"
|
||||
else:
|
||||
detailed_msg = error_msg
|
||||
self.message_user(
|
||||
request,
|
||||
f'批量爬取任务启动失败: {detailed_msg}',
|
||||
messages.ERROR
|
||||
)
|
||||
|
||||
return HttpResponseRedirect(reverse('admin:core_website_changelist'))
|
||||
|
||||
|
||||
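The admin actions above enqueue Celery tasks imported from .tasks (crawl_website, crawl_all_websites), but core/tasks.py itself is not included in this commit. A minimal sketch of the assumed task signatures, matching how they are called here (hypothetical):

# Hypothetical sketch -- core/tasks.py is not part of this diff.
from celery import shared_task


@shared_task
def crawl_website(website_id):
    """Crawl a single Website by primary key; invoked as crawl_website.delay(website.id)."""
    ...


@shared_task
def crawl_all_websites():
    """Crawl every enabled Website; invoked as crawl_all_websites.delay()."""
    ...
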
class ArticleAdmin(admin.ModelAdmin):
|
||||
"""文章管理"""
|
||||
list_display = [
|
||||
'title', 'website', 'created_at',
|
||||
'media_count', 'actions_column'
|
||||
]
|
||||
list_filter = [
|
||||
ArticleDateFilter, 'website', 'created_at'
|
||||
]
|
||||
search_fields = ['title', 'content', 'url']
|
||||
readonly_fields = ['created_at', 'media_files_display']
|
||||
date_hierarchy = 'created_at'
|
||||
|
||||
fieldsets = (
|
||||
('基本信息', {
|
||||
'fields': ('title', 'url', 'website')
|
||||
}),
|
||||
('内容', {
|
||||
'fields': ('content',)
|
||||
}),
|
||||
('媒体文件', {
|
||||
'fields': ('media_files_display',),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
('时间信息', {
|
||||
'fields': ('created_at',),
|
||||
'classes': ('collapse',)
|
||||
}),
|
||||
)
|
||||
|
||||
def content_preview(self, obj):
|
||||
"""内容预览"""
|
||||
return obj.content[:100] + '...' if len(obj.content) > 100 else obj.content
|
||||
|
||||
content_preview.short_description = '内容预览'
|
||||
|
||||
def media_count(self, obj):
|
||||
"""媒体文件数量"""
|
||||
if obj.media_files:
|
||||
return len(obj.media_files)
|
||||
return 0
|
||||
|
||||
media_count.short_description = '媒体文件'
|
||||
|
||||
def media_files_display(self, obj):
|
||||
"""媒体文件显示"""
|
||||
if not obj.media_files:
|
||||
return '无媒体文件'
|
||||
|
||||
html = '<div style="max-height: 300px; overflow-y: auto;">'
|
||||
for i, media in enumerate(obj.media_files):
|
||||
if media.get('type') == 'image':
|
||||
html += f'<div style="margin: 10px 0;"><img src="{media["url"]}" style="max-width: 200px; max-height: 150px;" /></div>'
|
||||
elif media.get('type') == 'video':
|
||||
html += f'<div style="margin: 10px 0;"><video controls style="max-width: 200px;"><source src="{media["url"]}" type="video/mp4"></video></div>'
|
||||
html += '</div>'
|
||||
return format_html(html)
|
||||
|
||||
media_files_display.short_description = '媒体文件'
|
||||
|
||||
def actions_column(self, obj):
|
||||
"""操作列"""
|
||||
# 修改: 添加跳转到本地文章详情页的链接
|
||||
return format_html(
|
||||
'<a href="{}" target="_blank" class="button">查看原文</a> '
|
||||
'<a href="{}" target="_blank" class="button">本地查看</a>',
|
||||
obj.url,
|
||||
reverse('article_detail', args=[obj.id])
|
||||
)
|
||||
|
||||
actions_column.short_description = '操作'
|
||||
|
||||
|
||||
class CrawlerStatusAdmin(admin.ModelAdmin):
|
||||
"""爬虫状态管理"""
|
||||
change_list_template = 'admin/crawler_status.html'
|
||||
|
||||
def changelist_view(self, request, extra_context=None):
|
||||
"""爬虫状态视图"""
|
||||
# 获取分布式爬虫状态
|
||||
nodes = distributed_crawler.get_available_nodes()
|
||||
node_statuses = []
|
||||
|
||||
for node_id in nodes:
|
||||
status = distributed_crawler.get_node_status(node_id)
|
||||
node_statuses.append(status)
|
||||
|
||||
# 获取最近的批次
|
||||
batches = distributed_crawler.get_all_batches()[:10]
|
||||
|
||||
# 获取任务统计
|
||||
task_stats = {
|
||||
'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]),
|
||||
'total_nodes': len(nodes),
|
||||
'total_batches': len(batches),
|
||||
}
|
||||
|
||||
extra_context = extra_context or {}
|
||||
extra_context.update({
|
||||
'nodes': node_statuses,
|
||||
'batches': batches,
|
||||
'task_stats': task_stats,
|
||||
})
|
||||
|
||||
return super().changelist_view(request, extra_context)
|
||||
|
||||
|
||||
# 注册管理类
|
||||
admin.site.register(Website, WebsiteAdmin)
|
||||
admin.site.register(Article, ArticleAdmin)
|
||||
|
||||
# 自定义管理站点标题
|
||||
admin.site.site_header = 'Green Classroom 管理系统'
|
||||
admin.site.site_title = 'Green Classroom'
|
||||
admin.site.index_title = '欢迎使用 Green Classroom 管理系统'
|
||||
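CrawlerStatusAdmin above supplies a custom change_list_template and changelist_view but is never registered in this file. One possible way to expose it is through a proxy model; this wiring does not appear in the commit and is illustrative only:

# Illustrative sketch only -- not part of this diff.
class CrawlerStatus(Article):
    """Proxy model so the crawler status page gets its own admin changelist URL."""
    class Meta:
        proxy = True
        verbose_name = '爬虫状态'
        verbose_name_plural = '爬虫状态'


admin.site.register(CrawlerStatus, CrawlerStatusAdmin)
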
746  core/api.py  (new file)
@@ -0,0 +1,746 @@
|
||||
"""
|
||||
RESTful API模块
|
||||
提供完整的API接口,支持爬虫管理、数据查询、任务控制
|
||||
"""
|
||||
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Any
|
||||
import json
|
||||
import csv
|
||||
import io
|
||||
import zipfile
|
||||
|
||||
from django.http import JsonResponse, HttpResponse
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
from django.views.decorators.http import require_http_methods
|
||||
from django.core.paginator import Paginator
|
||||
from django.db.models import Q, Count
|
||||
from django.utils import timezone
|
||||
# 添加DRF相关导入
|
||||
from rest_framework.views import APIView
|
||||
from rest_framework.response import Response
|
||||
from rest_framework.permissions import IsAuthenticated
|
||||
from rest_framework.authentication import SessionAuthentication, TokenAuthentication
|
||||
|
||||
# 添加python-docx库支持
|
||||
from docx import Document
|
||||
|
||||
# 添加BeautifulSoup导入
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .models import Website, Article
|
||||
from .tasks import crawl_website, cleanup_old_articles
|
||||
from .distributed_crawler import distributed_crawler
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def api_response(data=None, message="", status=200, error=None):
|
||||
"""统一的API响应格式"""
|
||||
response = {
|
||||
"success": status < 400,
|
||||
"message": message,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
|
||||
if data is not None:
|
||||
response["data"] = data
|
||||
|
||||
if error:
|
||||
response["error"] = error
|
||||
|
||||
# 如果是DRF视图,则返回DRF Response
|
||||
if hasattr(api_response, '_use_drf_response') and api_response._use_drf_response:
|
||||
return Response(response, status=status)
|
||||
|
||||
return JsonResponse(response, status=status)
|
||||
|
||||
|
||||
# 修改健康检查接口为DRF类视图
|
||||
class HealthView(APIView):
|
||||
"""健康检查接口"""
|
||||
permission_classes = [] # 允许无认证访问
|
||||
authentication_classes = []
|
||||
|
||||
def get(self, request):
|
||||
try:
|
||||
# 检查数据库连接
|
||||
website_count = Website.objects.count()
|
||||
article_count = Article.objects.count()
|
||||
|
||||
# 检查Redis连接
|
||||
from django.core.cache import cache
|
||||
cache.set('health_check', 'ok', 60)
|
||||
cache_result = cache.get('health_check')
|
||||
|
||||
health_data = {
|
||||
"status": "healthy",
|
||||
"database": "ok",
|
||||
"redis": "ok" if cache_result == 'ok' else 'error',
|
||||
"website_count": website_count,
|
||||
"article_count": article_count,
|
||||
"uptime": "running"
|
||||
}
|
||||
|
||||
# 设置使用DRF响应
|
||||
api_response._use_drf_response = True
|
||||
return api_response(data=health_data, message="服务运行正常")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"健康检查失败: {e}")
|
||||
return api_response(
|
||||
data={"status": "unhealthy", "error": str(e)},
|
||||
message="服务异常",
|
||||
status=500,
|
||||
error=str(e)
|
||||
)
|
||||
finally:
|
||||
api_response._use_drf_response = False
|
||||
|
||||
|
||||
# 修改网站列表接口为DRF类视图
|
||||
class WebsitesView(APIView):
|
||||
"""获取网站列表"""
|
||||
permission_classes = [IsAuthenticated]
|
||||
authentication_classes = [SessionAuthentication, TokenAuthentication]
|
||||
|
||||
def get(self, request):
|
||||
try:
|
||||
# 分页参数
|
||||
page = int(request.GET.get('page', 1))
|
||||
page_size = int(request.GET.get('page_size', 20))
|
||||
search = request.GET.get('search', '')
|
||||
enabled = request.GET.get('enabled', '')
|
||||
|
||||
# 构建查询
|
||||
queryset = Website.objects.all()
|
||||
|
||||
if search:
|
||||
queryset = queryset.filter(
|
||||
Q(name__icontains=search) |
|
||||
Q(base_url__icontains=search)
|
||||
)
|
||||
|
||||
if enabled in ['true', 'false']:
|
||||
queryset = queryset.filter(enabled=enabled == 'true')
|
||||
|
||||
# 排序 - 使用id字段替代不存在的created_at字段
|
||||
queryset = queryset.order_by('-id')
|
||||
|
||||
# 分页
|
||||
paginator = Paginator(queryset, page_size)
|
||||
websites_page = paginator.get_page(page)
|
||||
|
||||
# 统计数据
|
||||
stats = {
|
||||
'total_websites': Website.objects.count(),
|
||||
'enabled_websites': Website.objects.filter(enabled=True).count(),
|
||||
'disabled_websites': Website.objects.filter(enabled=False).count(),
|
||||
}
|
||||
|
||||
# 序列化数据
|
||||
websites_data = []
|
||||
for website in websites_page:
|
||||
website_data = {
|
||||
'id': website.id,
|
||||
'name': website.name,
|
||||
'base_url': website.base_url,
|
||||
'enabled': website.enabled,
|
||||
# 移除不存在的created_at和updated_at字段
|
||||
'article_count': website.article_set.count(),
|
||||
'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
|
||||
}
|
||||
websites_data.append(website_data)
|
||||
|
||||
response_data = {
|
||||
'websites': websites_data,
|
||||
'pagination': {
|
||||
'page': page,
|
||||
'page_size': page_size,
|
||||
'total_pages': paginator.num_pages,
|
||||
'total_count': paginator.count,
|
||||
'has_next': websites_page.has_next(),
|
||||
'has_previous': websites_page.has_previous(),
|
||||
},
|
||||
'stats': stats
|
||||
}
|
||||
|
||||
# 设置使用DRF响应
|
||||
api_response._use_drf_response = True
|
||||
return api_response(data=response_data, message="获取网站列表成功")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"获取网站列表失败: {e}")
|
||||
return api_response(message="获取网站列表失败", status=500, error=str(e))
|
||||
finally:
|
||||
api_response._use_drf_response = False
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["GET"])
|
||||
def api_website_detail(request, website_id):
|
||||
"""获取网站详情"""
|
||||
try:
|
||||
website = Website.objects.get(id=website_id)
|
||||
|
||||
# 获取最近的文章
|
||||
recent_articles = website.article_set.order_by('-created_at')[:10]
|
||||
|
||||
website_data = {
|
||||
'id': website.id,
|
||||
'name': website.name,
|
||||
'base_url': website.base_url,
|
||||
'enabled': website.enabled,
|
||||
'created_at': website.created_at.isoformat(),
|
||||
'updated_at': website.updated_at.isoformat(),
|
||||
'last_crawl': website.last_crawl.isoformat() if website.last_crawl else None,
|
||||
'article_count': website.article_set.count(),
|
||||
'recent_articles': [
|
||||
{
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'url': article.url,
|
||||
'created_at': article.created_at.isoformat(),
|
||||
}
|
||||
for article in recent_articles
|
||||
]
|
||||
}
|
||||
|
||||
return api_response(data=website_data, message="获取网站详情成功")
|
||||
|
||||
except Website.DoesNotExist:
|
||||
return api_response(message="网站不存在", status=404, error="Website not found")
|
||||
except Exception as e:
|
||||
logger.error(f"获取网站详情失败: {e}")
|
||||
return api_response(message="获取网站详情失败", status=500, error=str(e))
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["POST"])
|
||||
def api_crawl_website(request, website_id):
|
||||
"""爬取指定网站"""
|
||||
try:
|
||||
website = Website.objects.get(id=website_id)
|
||||
|
||||
# 启动爬虫任务
|
||||
task = crawl_website.delay(website_id)
|
||||
|
||||
response_data = {
|
||||
'task_id': task.id,
|
||||
'website_id': website_id,
|
||||
'website_name': website.name,
|
||||
'status': 'started'
|
||||
}
|
||||
|
||||
return api_response(data=response_data, message="爬虫任务已启动")
|
||||
|
||||
except Website.DoesNotExist:
|
||||
return api_response(message="网站不存在", status=404, error="Website not found")
|
||||
except Exception as e:
|
||||
logger.error(f"启动爬虫任务失败: {e}")
|
||||
return api_response(message="启动爬虫任务失败", status=500, error=str(e))
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["GET"])
|
||||
def api_articles(request):
|
||||
"""获取文章列表"""
|
||||
try:
|
||||
# 分页参数
|
||||
page = int(request.GET.get('page', 1))
|
||||
page_size = int(request.GET.get('page_size', 20))
|
||||
search = request.GET.get('search', '')
|
||||
website_id = request.GET.get('website_id', '')
|
||||
date_from = request.GET.get('date_from', '')
|
||||
date_to = request.GET.get('date_to', '')
|
||||
|
||||
# 构建查询
|
||||
queryset = Article.objects.select_related('website').all()
|
||||
|
||||
if search:
|
||||
queryset = queryset.filter(
|
||||
Q(title__icontains=search) |
|
||||
Q(content__icontains=search)
|
||||
)
|
||||
|
||||
if website_id:
|
||||
queryset = queryset.filter(website_id=website_id)
|
||||
|
||||
if date_from:
|
||||
try:
|
||||
date_from_obj = datetime.fromisoformat(date_from.replace('Z', '+00:00'))
|
||||
queryset = queryset.filter(created_at__gte=date_from_obj)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if date_to:
|
||||
try:
|
||||
date_to_obj = datetime.fromisoformat(date_to.replace('Z', '+00:00'))
|
||||
queryset = queryset.filter(created_at__lte=date_to_obj)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# 排序
|
||||
queryset = queryset.order_by('-created_at')
|
||||
|
||||
# 分页
|
||||
paginator = Paginator(queryset, page_size)
|
||||
articles_page = paginator.get_page(page)
|
||||
|
||||
# 统计数据
|
||||
stats = {
|
||||
'total_articles': Article.objects.count(),
|
||||
'today_articles': Article.objects.filter(
|
||||
created_at__date=timezone.now().date()
|
||||
).count(),
|
||||
'week_articles': Article.objects.filter(
|
||||
created_at__gte=timezone.now() - timedelta(days=7)
|
||||
).count(),
|
||||
}
|
||||
|
||||
# 序列化数据
|
||||
articles_data = []
|
||||
for article in articles_page:
|
||||
article_data = {
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'url': article.url,
|
||||
'content': article.content[:200] + '...' if len(article.content) > 200 else article.content,
|
||||
'created_at': article.created_at.isoformat(),
|
||||
'website': {
|
||||
'id': article.website.id,
|
||||
'name': article.website.name,
|
||||
},
|
||||
'media_files': article.media_files,
|
||||
}
|
||||
articles_data.append(article_data)
|
||||
|
||||
response_data = {
|
||||
'articles': articles_data,
|
||||
'pagination': {
|
||||
'page': page,
|
||||
'page_size': page_size,
|
||||
'total_pages': paginator.num_pages,
|
||||
'total_count': paginator.count,
|
||||
'has_next': articles_page.has_next(),
|
||||
'has_previous': articles_page.has_previous(),
|
||||
},
|
||||
'stats': stats
|
||||
}
|
||||
|
||||
return api_response(data=response_data, message="获取文章列表成功")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"获取文章列表失败: {e}")
|
||||
return api_response(message="获取文章列表失败", status=500, error=str(e))
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["GET"])
|
||||
def api_article_detail(request, article_id):
|
||||
"""获取文章详情"""
|
||||
try:
|
||||
article = Article.objects.select_related('website').get(id=article_id)
|
||||
|
||||
article_data = {
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'url': article.url,
|
||||
'content': article.content,
|
||||
'created_at': article.created_at.isoformat(),
|
||||
'website': {
|
||||
'id': article.website.id,
|
||||
'name': article.website.name,
|
||||
'base_url': article.website.base_url,
|
||||
},
|
||||
'media_files': article.media_files,
|
||||
}
|
||||
|
||||
return api_response(data=article_data, message="获取文章详情成功")
|
||||
|
||||
except Article.DoesNotExist:
|
||||
return api_response(message="文章不存在", status=404, error="Article not found")
|
||||
except Exception as e:
|
||||
logger.error(f"获取文章详情失败: {e}")
|
||||
return api_response(message="获取文章详情失败", status=500, error=str(e))
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["GET"])
|
||||
def api_crawler_status(request):
|
||||
"""获取爬虫状态"""
|
||||
try:
|
||||
# 获取分布式爬虫状态
|
||||
nodes = distributed_crawler.get_available_nodes()
|
||||
node_statuses = []
|
||||
|
||||
for node_id in nodes:
|
||||
status = distributed_crawler.get_node_status(node_id)
|
||||
node_statuses.append(status)
|
||||
|
||||
# 获取最近的批次
|
||||
batches = distributed_crawler.get_all_batches()[:10]
|
||||
|
||||
# 获取任务统计
|
||||
task_stats = {
|
||||
'active_tasks': len([n for n in node_statuses if n['active_tasks'] > 0]),
|
||||
'total_nodes': len(nodes),
|
||||
'total_batches': len(batches),
|
||||
}
|
||||
|
||||
response_data = {
|
||||
'nodes': node_statuses,
|
||||
'batches': batches,
|
||||
'stats': task_stats,
|
||||
}
|
||||
|
||||
return api_response(data=response_data, message="获取爬虫状态成功")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"获取爬虫状态失败: {e}")
|
||||
return api_response(message="获取爬虫状态失败", status=500, error=str(e))
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["POST"])
|
||||
def api_start_distributed_crawl(request):
|
||||
"""启动分布式爬取"""
|
||||
try:
|
||||
data = json.loads(request.body)
|
||||
website_ids = data.get('website_ids', [])
|
||||
|
||||
if not website_ids:
|
||||
return api_response(message="请选择要爬取的网站", status=400, error="No websites selected")
|
||||
|
||||
# 启动分布式爬取
|
||||
batch_id = distributed_crawler.distribute_crawl_tasks(website_ids)
|
||||
|
||||
if batch_id in ['no_websites', 'no_available_nodes']:
|
||||
return api_response(message="无法启动分布式爬取", status=400, error=batch_id)
|
||||
|
||||
response_data = {
|
||||
'batch_id': batch_id,
|
||||
'website_ids': website_ids,
|
||||
'status': 'started'
|
||||
}
|
||||
|
||||
return api_response(data=response_data, message="分布式爬取已启动")
|
||||
|
||||
except json.JSONDecodeError:
|
||||
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
|
||||
except Exception as e:
|
||||
logger.error(f"启动分布式爬取失败: {e}")
|
||||
return api_response(message="启动分布式爬取失败", status=500, error=str(e))
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["GET"])
|
||||
def api_batch_status(request, batch_id):
|
||||
"""获取批次状态"""
|
||||
try:
|
||||
batch_status = distributed_crawler.get_batch_status(batch_id)
|
||||
|
||||
if batch_status.get('status') == 'not_found':
|
||||
return api_response(message="批次不存在", status=404, error="Batch not found")
|
||||
|
||||
return api_response(data=batch_status, message="获取批次状态成功")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"获取批次状态失败: {e}")
|
||||
return api_response(message="获取批次状态失败", status=500, error=str(e))
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["GET", "POST"])
|
||||
def api_cleanup_articles(request):
|
||||
"""清理旧文章"""
|
||||
# 如果是GET请求,返回清理功能的描述信息
|
||||
if request.method == "GET":
|
||||
response_data = {
|
||||
'description': '文章清理API',
|
||||
'method': 'POST',
|
||||
'parameters': {
|
||||
'days': '保留天数,默认30天'
|
||||
},
|
||||
'example': {
|
||||
'days': 30
|
||||
}
|
||||
}
|
||||
return api_response(data=response_data, message="API使用说明")
|
||||
|
||||
try:
|
||||
data = json.loads(request.body)
|
||||
days = data.get('days', 30)
|
||||
|
||||
# 启动清理任务
|
||||
task = cleanup_old_articles.delay(days)
|
||||
|
||||
response_data = {
|
||||
'task_id': task.id,
|
||||
'days': days,
|
||||
'status': 'started'
|
||||
}
|
||||
|
||||
return api_response(data=response_data, message="清理任务已启动")
|
||||
|
||||
except json.JSONDecodeError:
|
||||
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
|
||||
except Exception as e:
|
||||
logger.error(f"启动清理任务失败: {e}")
|
||||
return api_response(message="启动清理任务失败", status=500, error=str(e))
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["GET"])
|
||||
def api_stats(request):
|
||||
"""获取统计信息"""
|
||||
try:
|
||||
# 基础统计
|
||||
total_websites = Website.objects.count()
|
||||
total_articles = Article.objects.count()
|
||||
enabled_websites = Website.objects.filter(enabled=True).count()
|
||||
|
||||
# 时间统计
|
||||
today = timezone.now().date()
|
||||
week_ago = timezone.now() - timedelta(days=7)
|
||||
month_ago = timezone.now() - timedelta(days=30)
|
||||
|
||||
today_articles = Article.objects.filter(created_at__date=today).count()
|
||||
week_articles = Article.objects.filter(created_at__gte=week_ago).count()
|
||||
month_articles = Article.objects.filter(created_at__gte=month_ago).count()
|
||||
|
||||
# 网站统计
|
||||
website_stats = []
|
||||
for website in Website.objects.all():
|
||||
website_stats.append({
|
||||
'id': website.id,
|
||||
'name': website.name,
|
||||
'article_count': website.article_set.count(),
|
||||
# 使用getattr安全访问last_crawl属性,如果不存在则返回None
|
||||
'last_crawl': website.last_crawl.isoformat() if getattr(website, 'last_crawl', None) else None,
|
||||
})
|
||||
|
||||
# 分布式爬虫统计
|
||||
nodes = distributed_crawler.get_available_nodes()
|
||||
batches = distributed_crawler.get_all_batches()
|
||||
|
||||
response_data = {
|
||||
'overview': {
|
||||
'total_websites': total_websites,
|
||||
'enabled_websites': enabled_websites,
|
||||
'total_articles': total_articles,
|
||||
'today_articles': today_articles,
|
||||
'week_articles': week_articles,
|
||||
'month_articles': month_articles,
|
||||
},
|
||||
'websites': website_stats,
|
||||
'crawler': {
|
||||
'active_nodes': len(nodes),
|
||||
'total_batches': len(batches),
|
||||
'recent_batches': batches[:5],
|
||||
}
|
||||
}
|
||||
|
||||
return api_response(data=response_data, message="获取统计信息成功")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"获取统计信息失败: {e}")
|
||||
return api_response(message="获取统计信息失败", status=500, error=str(e))
|
||||
|
||||
|
||||
@csrf_exempt
|
||||
@require_http_methods(["POST"])
|
||||
def export_articles(request):
|
||||
"""导出文章"""
|
||||
try:
|
||||
data = json.loads(request.body)
|
||||
article_ids = data.get('article_ids', [])
|
||||
export_format = data.get('format', 'docx') # 默认改为docx格式
|
||||
|
||||
if not article_ids:
|
||||
return api_response(message="请选择要导出的文章", status=400, error="No articles selected")
|
||||
|
||||
# 获取文章数据
|
||||
articles = Article.objects.filter(id__in=article_ids).select_related('website')
|
||||
|
||||
if not articles.exists():
|
||||
return api_response(message="未找到指定的文章", status=404, error="Articles not found")
|
||||
|
||||
import os # 添加导入
|
||||
from django.conf import settings # 添加导入
|
||||
|
||||
if export_format == 'json':
|
||||
# 导出为JSON格式
|
||||
articles_data = []
|
||||
for article in articles:
|
||||
articles_data.append({
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'url': article.url,
|
||||
'content': article.content,
|
||||
'created_at': article.created_at.isoformat(),
|
||||
'website': {
|
||||
'id': article.website.id,
|
||||
'name': article.website.name,
|
||||
},
|
||||
'media_files': article.media_files,
|
||||
})
|
||||
|
||||
response = HttpResponse(
|
||||
json.dumps(articles_data, ensure_ascii=False, indent=2),
|
||||
content_type='application/json'
|
||||
)
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.json"'
|
||||
return response
|
||||
|
||||
elif export_format == 'csv':
|
||||
# 导出为CSV格式
|
||||
output = io.StringIO()
|
||||
writer = csv.writer(output)
|
||||
writer.writerow(['ID', '标题', '网址', '内容', '创建时间', '网站'])
|
||||
|
||||
for article in articles:
|
||||
writer.writerow([
|
||||
article.id,
|
||||
article.title,
|
||||
article.url,
|
||||
article.content[:1000] + '...' if len(article.content) > 1000 else article.content,
|
||||
article.created_at.isoformat(),
|
||||
article.website.name
|
||||
])
|
||||
|
||||
response = HttpResponse(output.getvalue(), content_type='text/csv')
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.csv"'
|
||||
return response
|
||||
|
||||
elif export_format == 'docx':
|
||||
# 导出为Word格式,每个文章一个文件夹
|
||||
zip_buffer = io.BytesIO()
|
||||
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
|
||||
for article in articles:
|
||||
# 创建文章文件夹名称
|
||||
safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip()
|
||||
folder_name = f"article_{article.id}_{safe_title}"[:50]
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading(article.title, 0)
|
||||
|
||||
# 添加文章信息
|
||||
doc.add_paragraph(f"网站: {article.website.name}")
|
||||
doc.add_paragraph(f"网址: {article.url}")
|
||||
doc.add_paragraph(f"发布时间: {article.pub_date.isoformat() if article.pub_date else 'N/A'}")
|
||||
doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}")
|
||||
|
||||
# 添加内容标题
|
||||
doc.add_heading('内容:', level=1)
|
||||
|
||||
# 处理HTML内容
|
||||
content_text = BeautifulSoup(article.content, 'html.parser').get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 将文档保存到内存中
|
||||
doc_buffer = io.BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
|
||||
# 添加到ZIP文件
|
||||
zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue())
|
||||
|
||||
# 添加媒体文件(如果存在)
|
||||
if article.media_files:
|
||||
for media in article.media_files:
|
||||
try:
|
||||
# 如果是本地文件路径
|
||||
if not media.startswith('http'):
|
||||
media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/'))
|
||||
if os.path.exists(media_path):
|
||||
zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}")
|
||||
# 如果是URL格式的媒体文件
|
||||
else:
|
||||
import requests
|
||||
from io import BytesIO
|
||||
|
||||
response = requests.get(media, timeout=10)
|
||||
if response.status_code == 200:
|
||||
image_stream = BytesIO(response.content)
|
||||
media_filename = f"{folder_name}/media/{os.path.basename(media)}"
|
||||
zip_file.writestr(media_filename, image_stream.getvalue())
|
||||
except Exception:
|
||||
# 忽略无法添加的媒体文件
|
||||
pass
|
||||
|
||||
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.zip"'
|
||||
return response
|
||||
|
||||
elif export_format == 'zip':
|
||||
# 导出为ZIP包,每个文章一个文件夹
|
||||
zip_buffer = io.BytesIO()
|
||||
with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
|
||||
for article in articles:
|
||||
# 创建文章文件夹名称
|
||||
safe_title = "".join(c for c in article.title if c.isalnum() or c in (' ','_','-')).rstrip()
|
||||
folder_name = f"article_{article.id}_{safe_title}"[:50]
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading(article.title, 0)
|
||||
|
||||
# 添加文章信息
|
||||
doc.add_paragraph(f"网站: {article.website.name}")
|
||||
doc.add_paragraph(f"网址: {article.url}")
|
||||
doc.add_paragraph(f"发布时间: {article.pub_date.isoformat() if article.pub_date else 'N/A'}")
|
||||
doc.add_paragraph(f"创建时间: {article.created_at.isoformat()}")
|
||||
|
||||
# 添加内容标题
|
||||
doc.add_heading('内容:', level=1)
|
||||
|
||||
# 处理HTML内容
|
||||
content_text = BeautifulSoup(article.content, 'html.parser').get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 将文档保存到内存中
|
||||
doc_buffer = io.BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
|
||||
# 添加到ZIP文件
|
||||
zip_file.writestr(f"{folder_name}/article.docx", doc_buffer.getvalue())
|
||||
|
||||
# 添加媒体文件(如果存在)
|
||||
if article.media_files:
|
||||
for media in article.media_files:
|
||||
try:
|
||||
# 如果是本地文件路径
|
||||
if not media.startswith('http'):
|
||||
media_path = os.path.join(settings.MEDIA_ROOT, media.lstrip('/'))
|
||||
if os.path.exists(media_path):
|
||||
zip_file.write(media_path, f"{folder_name}/media/{os.path.basename(media_path)}")
|
||||
# 如果是URL格式的媒体文件
|
||||
else:
|
||||
import requests
|
||||
from io import BytesIO
|
||||
|
||||
response = requests.get(media, timeout=10)
|
||||
if response.status_code == 200:
|
||||
image_stream = BytesIO(response.content)
|
||||
media_filename = f"{folder_name}/media/{os.path.basename(media)}"
|
||||
zip_file.writestr(media_filename, image_stream.getvalue())
|
||||
except Exception:
|
||||
# 忽略无法添加的媒体文件
|
||||
pass
|
||||
|
||||
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.zip"'
|
||||
return response
|
||||
|
||||
else:
|
||||
return api_response(message="不支持的导出格式", status=400, error="Unsupported format")
|
||||
|
||||
except json.JSONDecodeError:
|
||||
return api_response(message="请求数据格式错误", status=400, error="Invalid JSON")
|
||||
except Exception as e:
|
||||
logger.error(f"导出文章失败: {e}")
|
||||
return api_response(message="导出文章失败", status=500, error=str(e))
|
||||
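For reference, a hedged client-side sketch of calling the export_articles endpoint defined above. The URL path is an assumption (urls.py is not part of this diff); the payload fields and accepted formats (json, csv, docx, zip) come from the view itself:

# Hypothetical client call -- the actual route for export_articles is not shown in this commit.
import requests

payload = {"article_ids": [1, 2, 3], "format": "docx"}  # docx/zip formats return a ZIP archive
resp = requests.post(
    "http://localhost:8000/api/articles/export/",  # assumed path
    json=payload,
    timeout=30,
)
if resp.ok:
    with open("articles_export.zip", "wb") as fh:
        fh.write(resp.content)
else:
    print(resp.status_code, resp.text)
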
@@ -4,3 +4,8 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'core'

    def ready(self):
        """应用启动时执行"""
        # 导入Admin扩展
        import core.admin_extended

@@ -9,7 +9,7 @@ class Command(BaseCommand):
    def add_arguments(self, parser):
        parser.add_argument('--media', type=str, help='指定要爬取的媒体,用逗号分隔')
        parser.add_argument('--platform', type=str, default='all',
                            help='指定平台类型: all(全部), web(网站), mobile(移动端)')
                            help='指定平台类型: all(全部), web(网站)')

    def handle(self, *args, **options):
        media_list = options['media']

@@ -9,7 +9,7 @@ class Command(BaseCommand):

    def add_arguments(self, parser):
        parser.add_argument('--platform', type=str, default='all',
                            choices=['cctv', 'cctvnews', 'mobile', 'all'],
                            choices=['cctv', 'cctvnews', 'all'],
                            help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), all(全部)')

    def handle(self, *args, **options):

@@ -3,13 +3,12 @@ from core.models import Website
from core.utils import full_site_crawler


# jimmy.fang-20250815: 因URL问题,移除中国网-省份
class Command(BaseCommand):
    help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"

    def add_arguments(self, parser):
        parser.add_argument('--platform', type=str, default='all',
                            choices=['china', 'province', 'all'],
                            choices=['china', 'all'],
                            help='选择爬取平台: china(中国网主网), province(中国网一省份), all(全部)')

    def handle(self, *args, **options):

@@ -23,12 +22,7 @@ class Command(BaseCommand):
                'start_url': 'http://www.china.com.cn',
                'article_selector': 'a'
            },
            # 'province': {
            #     'name': '中国网一省份',
            #     'base_url': 'http://www.china.com.cn',
            #     'start_url': 'http://www.china.com.cn/province',
            #     'article_selector': 'a'
            # }

        }

        if platform == 'all':

@@ -8,7 +8,7 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['chinanews', 'mobile', 'all'],
choices=['chinanews', 'all'],
help='选择爬取平台: chinanews(中国新闻社), all(全部)')

def handle(self, *args, **options):

@@ -50,4 +50,4 @@ class Command(BaseCommand):
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")

self.stdout.write(self.style.SUCCESS("中国政府网所有平台爬取完成"))
self.stdout.write(self.style.SUCCESS("中国政府网所有平台爬取完成"))

@@ -50,4 +50,4 @@ class Command(BaseCommand):
full_site_crawler(platform_config['start_url'], website, max_pages=500)
self.stdout.write(f"完成爬取: {platform_config['name']}")

self.stdout.write(self.style.SUCCESS("东方烟草报所有平台爬取完成"))
self.stdout.write(self.style.SUCCESS("东方烟草报所有平台爬取完成"))

@@ -8,7 +8,7 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['fzrb', 'mobile', 'all'],
choices=['fzrb', 'all'],
help='选择爬取平台: fzrb(法治日报), all(全部)')

def handle(self, *args, **options):

@@ -2,13 +2,14 @@ from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler

# jimmy.fang-20250815: 光明日报反爬,会被阻挡

# jimmy.fang-20250815: 取消对光明日报的支持,光明日报反爬,被阻挡
class Command(BaseCommand):
help = "全站递归爬取 光明日报及其子网站、客户端、新媒体平台"

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['gmrb', 'mobile', 'all'],
choices=['gmrb', 'all'],
help='选择爬取平台: gmrb(光明日报), all(全部)')

def handle(self, *args, **options):

@@ -8,7 +8,7 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['grrb', 'mobile', 'all'],
choices=['grrb', 'all'],
help='选择爬取平台: grrb(工人日报), all(全部)')

def handle(self, *args, **options):

@@ -8,7 +8,7 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['jjrb', 'mobile', 'all'],
choices=['jjrb', 'all'],
help='选择爬取平台: jjrb(经济日报), all(全部)')

def handle(self, *args, **options):

@@ -9,7 +9,7 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['kjrb', 'mobile', 'all'],
choices=['kjrb', 'all'],
help='选择爬取平台: kjrb(科技日报), all(全部)')

def handle(self, *args, **options):

@@ -8,7 +8,7 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['nmrb', 'mobile', 'all'],
choices=['nmrb', 'all'],
help='选择爬取平台: nmrb(农民日报), all(全部)')

def handle(self, *args, **options):

@@ -8,8 +8,8 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['pla', 'mobile', 'all'],
help='选择爬取平台: pla(解放军报), mobile(移动端), all(全部)')
choices=['pla', 'all'],
help='选择爬取平台: pla(解放军报), all(全部)')

def handle(self, *args, **options):
platform = options['platform']

@@ -8,8 +8,8 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['rmzxb', 'mobile', 'all'],
help='选择爬取平台: rmzxb(人民政协网), mobile(移动端), all(全部)')
choices=['rmzxb', 'all'],
help='选择爬取平台: rmzxb(人民政协网), all(全部)')

def handle(self, *args, **options):
platform = options['platform']

@@ -8,8 +8,8 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['news', 'xinhuanet', 'mobile', 'all'],
help='选择爬取平台: news(新华网), xinhuanet(新华网主站), mobile(移动端), all(全部)')
choices=['news', 'all'],
help='选择爬取平台: news(新华网), all(全部)')

def handle(self, *args, **options):
platform = options['platform']
@@ -22,18 +22,7 @@ class Command(BaseCommand):
'start_url': 'https://www.news.cn',
'article_selector': 'a'
},
'xinhuanet': {
'name': '新华网主站',
'base_url': 'https://www.xinhuanet.com',
'start_url': 'https://www.xinhuanet.com',
'article_selector': 'a'
},
'mobile': {
'name': '新华社移动端',
'base_url': 'https://m.xinhuanet.com',
'start_url': 'https://m.xinhuanet.com',
'article_selector': 'a'
}

}

if platform == 'all':

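Each of these crawler commands follows the same shape: a --platform flag whose choices now drop the retired mobile and secondary endpoints, a platforms dict mapping each choice to its start URL, and a handler that crawls either every entry or just the requested one. A condensed sketch of that shared dispatch pattern is below; the 'demo' entry is a placeholder, and since the hunks do not show how the command obtains its Website row, the get_or_create call is an assumption:

from django.core.management.base import BaseCommand

from core.models import Website
from core.utils import full_site_crawler


class Command(BaseCommand):
    help = "Sketch of the shared --platform dispatch used by the crawler commands"

    def add_arguments(self, parser):
        parser.add_argument('--platform', type=str, default='all',
                            choices=['demo', 'all'],
                            help='Which platform configuration to crawl')

    def handle(self, *args, **options):
        platforms = {
            'demo': {
                'name': 'Demo site',
                'base_url': 'https://example.com',
                'start_url': 'https://example.com',
                'article_selector': 'a',
            },
        }
        selected = platforms.values() if options['platform'] == 'all' \
            else [platforms[options['platform']]]

        for config in selected:
            # Assumption: ensure a Website row exists for this platform.
            website, _ = Website.objects.get_or_create(
                name=config['name'], defaults={'base_url': config['base_url']})
            full_site_crawler(config['start_url'], website, max_pages=500)
            self.stdout.write(f"完成爬取: {config['name']}")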
@@ -8,8 +8,8 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['xuexi', 'central', 'provincial', 'all'],
help='选择爬取平台: xuexi(学习强国主站), central(中央媒体), provincial(省级平台), all(全部)')
choices=['xuexi', 'all'],
help='选择爬取平台: xuexi(学习强国主站), all(全部)')

def handle(self, *args, **options):
platform = options['platform']
@@ -22,18 +22,6 @@ class Command(BaseCommand):
'start_url': 'https://www.xuexi.cn',
'article_selector': 'a'
},
'central': {
'name': '学习强国中央媒体',
'base_url': 'https://www.xuexi.cn',
'start_url': 'https://www.xuexi.cn/central',
'article_selector': 'a'
},
'provincial': {
'name': '学习强国省级平台',
'base_url': 'https://www.xuexi.cn',
'start_url': 'https://www.xuexi.cn/provincial',
'article_selector': 'a'
}
}

if platform == 'all':

@@ -8,8 +8,8 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['xxsb', 'mobile', 'all'],
help='选择爬取平台: xxsb(学习时报), mobile(移动端), all(全部)')
choices=['xxsb', 'all'],
help='选择爬取平台: xxsb(学习时报),all(全部)')

def handle(self, *args, **options):
platform = options['platform']
@@ -22,12 +22,6 @@ class Command(BaseCommand):
'start_url': 'http://www.studytimes.cn',
'article_selector': 'a'
},
'mobile': {
'name': '学习时报移动端',
'base_url': 'http://m.studytimes.cn',
'start_url': 'http://m.studytimes.cn',
'article_selector': 'a'
}
}

if platform == 'all':

@@ -8,8 +8,8 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgfnb', 'mobile', 'all'],
help='选择爬取平台: zgfnb(中国妇女报), mobile(移动端), all(全部)')
choices=['zgfnb', 'all'],
help='选择爬取平台: zgfnb(中国妇女报), all(全部)')

def handle(self, *args, **options):
platform = options['platform']
@@ -22,12 +22,7 @@ class Command(BaseCommand):
'start_url': 'http://www.cnwomen.com.cn',
'article_selector': 'a'
},
'mobile': {
'name': '中国妇女报移动端',
'base_url': 'http://m.cnwomen.com.cn',
'start_url': 'http://m.cnwomen.com.cn',
'article_selector': 'a'
}

}

if platform == 'all':

@@ -8,8 +8,8 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgjwjc', 'mobile', 'all'],
help='选择爬取平台: zgjwjc(中国纪检监察报), mobile(移动端), all(全部)')
choices=['zgjwjc', 'all'],
help='选择爬取平台: zgjwjc(中国纪检监察报),all(全部)')

def handle(self, *args, **options):
platform = options['platform']
@@ -18,16 +18,10 @@ class Command(BaseCommand):
platforms = {
'zgjwjc': {
'name': '中国纪检监察报',
'base_url': 'http://www.jjjcb.cn',
'start_url': 'http://www.jjjcb.cn',
'base_url': 'https://jjjcb.ccdi.gov.cn',
'start_url': 'https://jjjcb.ccdi.gov.cn',
'article_selector': 'a'
},
'mobile': {
'name': '中国纪检监察报移动端',
'base_url': 'http://m.jjjcb.cn',
'start_url': 'http://m.jjjcb.cn',
'article_selector': 'a'
}
}

if platform == 'all':

@@ -8,8 +8,8 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--platform', type=str, default='all',
choices=['zgqnb', 'mobile', 'all'],
help='选择爬取平台: zgqnb(中国青年报), mobile(移动端), all(全部)')
choices=['zgqnb', 'all'],
help='选择爬取平台: zgqnb(中国青年报), all(全部)')

def handle(self, *args, **options):
platform = options['platform']
@@ -22,12 +22,7 @@ class Command(BaseCommand):
'start_url': 'https://www.cyol.com',
'article_selector': 'a'
},
'mobile': {
'name': '中国青年报移动端',
'base_url': 'https://m.cyol.com',
'start_url': 'https://m.cyol.com',
'article_selector': 'a'
}

}

if platform == 'all':

@@ -6,6 +6,10 @@ import os
from django.conf import settings
import zipfile
from django.utils import timezone
from bs4 import BeautifulSoup
# 添加python-docx库支持
import io
from docx import Document


class Command(BaseCommand):
@@ -119,201 +123,100 @@ class Command(BaseCommand):
# 添加Word格式导出方法
def export_as_word(self, articles_data, output_path):
try:
from docx import Document
from docx.shared import Inches
except ImportError:
self.stdout.write(self.style.ERROR('缺少python-docx库,请安装: pip install python-docx'))
return

# 创建Word文档
doc = Document()
doc.add_heading('文章导出', 0)

for article_data in articles_data:
# 添加文章标题
doc.add_heading(article_data['title'], level=1)

# 添加文章元数据
doc.add_paragraph(f"网站: {article_data['website']}")
doc.add_paragraph(f"URL: {article_data['url']}")
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
doc.add_paragraph(f"创建时间: {article_data['created_at']}")

# 添加文章内容
doc.add_heading('内容', level=2)
# 简单处理HTML内容,移除标签
from bs4 import BeautifulSoup
soup = BeautifulSoup(article_data['content'], 'html.parser')

# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
# 尝试添加图片到文档
try:
import os
from django.conf import settings
import requests
from io import BytesIO

# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
except Exception as e:
# 如果添加图片失败,添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")

# 移除原始img标签
img.decompose()

content_text = soup.get_text()
doc.add_paragraph(content_text)

# 添加媒体文件信息
if article_data['media_files']:
doc.add_heading('媒体文件', level=2)
for media_file in article_data['media_files']:
try:
import os
from django.conf import settings
from io import BytesIO
import requests

full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加图片到文档
doc.add_picture(full_path, width=Inches(4.0))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)

# 添加分页符
doc.add_page_break()

# 保存文档
doc.save(output_path)
# 创建一个新的Word文档
document = Document()
document.add_heading('文章导出', 0)

for article_data in articles_data:
# 添加文章标题
document.add_heading(article_data['title'], level=1)

# 添加文章信息
document.add_paragraph(f"网站: {article_data['website']}")
document.add_paragraph(f"URL: {article_data['url']}")
document.add_paragraph(f"发布时间: {article_data['pub_date']}")
document.add_paragraph(f"创建时间: {article_data['created_at']}")

# 添加内容标题
document.add_heading('内容:', level=2)

# 处理HTML内容,移除标签
soup = BeautifulSoup(article_data['content'], 'html.parser')
content_text = soup.get_text()
document.add_paragraph(content_text)

# 添加分页符分隔文章
document.add_page_break()

# 保存文档
document.save(output_path)
self.stdout.write(self.style.SUCCESS(f'成功导出为Word格式: {output_path}'))
except Exception as e:
self.stdout.write(self.style.ERROR(f'导出Word格式失败: {e}'))

def export_with_media(self, articles_data, media_files, output_path, format_type):
# 创建ZIP文件
with zipfile.ZipFile(output_path, 'w') as zipf:
# 添加文章数据文件
data_filename = f'articles.{format_type}'
if format_type == 'json':
json_data = json.dumps(articles_data, ensure_ascii=False, indent=2)
zipf.writestr(data_filename, json_data)
elif format_type == 'csv':
# 创建CSV内容
if articles_data:
import io
csv_buffer = io.StringIO()
fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
writer.writeheader()
for article_data in articles_data:
article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
'media_files'] else ''
writer.writerow(article_data)
zipf.writestr(data_filename, csv_buffer.getvalue())
elif format_type == 'docx':
# 创建Word文档并保存到ZIP
try:
from docx import Document
from docx.shared import Inches
from io import BytesIO

doc = Document()
doc.add_heading('文章导出', 0)

for article_data in articles_data:
doc.add_heading(article_data['title'], level=1)
# 为每篇文章创建独立的文件夹
for article_data in articles_data:
article_folder = f"article_{article_data['id']}_{article_data['title']}"
# 限制文件夹名称长度并移除非法字符
article_folder = article_folder[:50].rstrip()
article_folder = "".join(c for c in article_folder if c.isalnum() or c in (' ','_','-')).rstrip()

# 添加文章数据文件
if format_type == 'docx':
# 创建Word文档并保存到ZIP
data_filename = f'{article_folder}/article.docx'
try:
# 创建文章信息Word文档
doc = Document()
doc.add_heading(article_data['title'], 0)

# 添加文章信息
doc.add_paragraph(f"网站: {article_data['website']}")
doc.add_paragraph(f"URL: {article_data['url']}")
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
doc.add_paragraph(f"创建时间: {article_data['created_at']}")

doc.add_heading('内容', level=2)
from bs4 import BeautifulSoup

# 添加内容标题
doc.add_heading('内容:', level=1)

# 处理HTML内容
soup = BeautifulSoup(article_data['content'], 'html.parser')

# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
# 尝试添加图片到文档
try:
import os
from django.conf import settings
import requests

# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
except Exception as e:
# 如果添加图片失败,添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")

# 移除原始img标签
img.decompose()

content_text = soup.get_text()
doc.add_paragraph(content_text)

if article_data['media_files']:
doc.add_heading('媒体文件', level=2)
for media_file in article_data['media_files']:
try:
import os
from django.conf import settings

full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加图片到文档
doc.add_picture(full_path, width=Inches(4.0))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)

doc.add_page_break()

# 将文档保存到内存中再写入ZIP
doc_buffer = BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
zipf.writestr(data_filename, doc_buffer.read())
except ImportError:
zipf.writestr(data_filename, "错误:缺少python-docx库,无法生成Word文档")

# 添加媒体文件
for media_path in media_files:
arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
zipf.write(media_path, arcname)

# 将文档保存到内存中
doc_buffer = io.BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)

# 将文档添加到ZIP文件
zipf.writestr(data_filename, doc_buffer.getvalue())
except Exception as e:
error_msg = f"错误:无法生成文章Word文档 - {str(e)}"
zipf.writestr(data_filename, error_msg)

# 添加媒体文件到文章的media子文件夹
if article_data['media_files']:
for media_file in article_data['media_files']:
try:
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加媒体文件到ZIP中的media子文件夹
media_filename = f"{article_folder}/media/{os.path.basename(media_file)}"
zipf.write(full_path, media_filename)
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
import requests
from io import BytesIO

response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
media_filename = f"{article_folder}/media/{os.path.basename(media_file)}"
zipf.writestr(media_filename, image_stream.getvalue())
except Exception as e:
# 错误处理,跳过无法添加的文件
pass
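The per-article folder name above is truncated to 50 characters and then filtered character by character so that slashes and other illegal characters cannot break the ZIP paths. The same sanitization can be written as a compact standalone helper; the helper name below is illustrative only:

import re


def safe_folder_name(article_id, title, max_len=50):
    """Build a ZIP-safe folder name like 'article_12_Some title'."""
    raw = f"article_{article_id}_{title}"[:max_len].rstrip()
    # Keep letters, digits, spaces, underscores and hyphens; drop everything else.
    return re.sub(r"[^\w \-]", "", raw).rstrip()


print(safe_folder_name(12, "标题/with:illegal*chars"))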
@@ -3,6 +3,7 @@

{% block object-tools %}
{{ block.super }}
<!--
<div style="margin-top: 10px;">
<form method="post" action="{% url 'admin:run_crawler' %}" style="display: inline-block;">
{% csrf_token %}
@@ -16,4 +17,5 @@
<input type="submit" value="执行爬虫" class="default" style="margin-left: 10px;"/>
</form>
</div>
-->
{% endblock %}
304
core/templates/admin/crawler_status.html
Normal file
304
core/templates/admin/crawler_status.html
Normal file
@@ -0,0 +1,304 @@
|
||||
{% extends "admin/base_site.html" %}
|
||||
{% load static %}
|
||||
|
||||
{% block title %}爬虫状态 - {{ site_title|default:_('Django site admin') }}{% endblock %}
|
||||
|
||||
{% block extrastyle %}
|
||||
<style>
|
||||
.status-card {
|
||||
background: white;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 8px;
|
||||
padding: 20px;
|
||||
margin: 20px 0;
|
||||
box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
.status-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 20px;
|
||||
padding-bottom: 10px;
|
||||
border-bottom: 2px solid #f0f0f0;
|
||||
}
|
||||
|
||||
.status-title {
|
||||
font-size: 24px;
|
||||
font-weight: bold;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.stats-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
||||
gap: 20px;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
|
||||
.stat-card {
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
color: white;
|
||||
padding: 20px;
|
||||
border-radius: 8px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.stat-number {
|
||||
font-size: 32px;
|
||||
font-weight: bold;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
|
||||
.stat-label {
|
||||
font-size: 14px;
|
||||
opacity: 0.9;
|
||||
}
|
||||
|
||||
.nodes-section, .batches-section {
|
||||
margin-top: 30px;
|
||||
}
|
||||
|
||||
.section-title {
|
||||
font-size: 20px;
|
||||
font-weight: bold;
|
||||
margin-bottom: 15px;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.node-item, .batch-item {
|
||||
background: #f8f9fa;
|
||||
border: 1px solid #e9ecef;
|
||||
border-radius: 6px;
|
||||
padding: 15px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.node-header, .batch-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.node-name, .batch-id {
|
||||
font-weight: bold;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.node-status, .batch-status {
|
||||
padding: 4px 8px;
|
||||
border-radius: 4px;
|
||||
font-size: 12px;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.status-active {
|
||||
background: #d4edda;
|
||||
color: #155724;
|
||||
}
|
||||
|
||||
.status-running {
|
||||
background: #fff3cd;
|
||||
color: #856404;
|
||||
}
|
||||
|
||||
.status-completed {
|
||||
background: #d1ecf1;
|
||||
color: #0c5460;
|
||||
}
|
||||
|
||||
.status-failed {
|
||||
background: #f8d7da;
|
||||
color: #721c24;
|
||||
}
|
||||
|
||||
.node-details, .batch-details {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
|
||||
gap: 10px;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.detail-item {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
}
|
||||
|
||||
.detail-label {
|
||||
color: #666;
|
||||
}
|
||||
|
||||
.detail-value {
|
||||
font-weight: bold;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.progress-bar {
|
||||
width: 100%;
|
||||
height: 8px;
|
||||
background: #e9ecef;
|
||||
border-radius: 4px;
|
||||
overflow: hidden;
|
||||
margin-top: 10px;
|
||||
}
|
||||
|
||||
.progress-fill {
|
||||
height: 100%;
|
||||
background: linear-gradient(90deg, #28a745, #20c997);
|
||||
transition: width 0.3s ease;
|
||||
}
|
||||
|
||||
.refresh-btn {
|
||||
background: #007bff;
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 8px 16px;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.refresh-btn:hover {
|
||||
background: #0056b3;
|
||||
}
|
||||
|
||||
.no-data {
|
||||
text-align: center;
|
||||
color: #666;
|
||||
padding: 40px;
|
||||
font-style: italic;
|
||||
}
|
||||
</style>
|
||||
{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<div class="status-card">
|
||||
<div class="status-header">
|
||||
<h1 class="status-title">爬虫状态监控</h1>
|
||||
<button class="refresh-btn" onclick="location.reload()">刷新</button>
|
||||
</div>
|
||||
|
||||
<!-- 统计卡片 -->
|
||||
<div class="stats-grid">
|
||||
<div class="stat-card">
|
||||
<div class="stat-number">{{ task_stats.total_nodes }}</div>
|
||||
<div class="stat-label">活跃节点</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="stat-number">{{ task_stats.active_tasks }}</div>
|
||||
<div class="stat-label">运行中任务</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="stat-number">{{ task_stats.total_batches }}</div>
|
||||
<div class="stat-label">总批次</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="stat-number">{{ nodes|length }}</div>
|
||||
<div class="stat-label">在线节点</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 节点状态 -->
|
||||
<div class="nodes-section">
|
||||
<h2 class="section-title">爬虫节点状态</h2>
|
||||
{% if nodes %}
|
||||
{% for node in nodes %}
|
||||
<div class="node-item">
|
||||
<div class="node-header">
|
||||
<span class="node-name">{{ node.node_id }}</span>
|
||||
<span class="node-status status-active">{{ node.status }}</span>
|
||||
</div>
|
||||
<div class="node-details">
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">活跃任务:</span>
|
||||
<span class="detail-value">{{ node.active_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">完成任务:</span>
|
||||
<span class="detail-value">{{ node.completed_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">失败任务:</span>
|
||||
<span class="detail-value">{{ node.failed_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">最后心跳:</span>
|
||||
<span class="detail-value">
|
||||
{% if node.last_heartbeat %}
|
||||
{{ node.last_heartbeat|date:"H:i:s" }}
|
||||
{% else %}
|
||||
未知
|
||||
{% endif %}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<div class="no-data">
|
||||
暂无活跃的爬虫节点
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
<!-- 批次状态 -->
|
||||
<div class="batches-section">
|
||||
<h2 class="section-title">最近批次</h2>
|
||||
{% if batches %}
|
||||
{% for batch in batches %}
|
||||
<div class="batch-item">
|
||||
<div class="batch-header">
|
||||
<span class="batch-id">{{ batch.batch_id }}</span>
|
||||
<span class="batch-status status-{{ batch.status }}">
|
||||
{% if batch.status == 'running' %}
|
||||
运行中
|
||||
{% elif batch.status == 'completed' %}
|
||||
已完成
|
||||
{% elif batch.status == 'failed' %}
|
||||
失败
|
||||
{% else %}
|
||||
{{ batch.status }}
|
||||
{% endif %}
|
||||
</span>
|
||||
</div>
|
||||
<div class="batch-details">
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">总任务:</span>
|
||||
<span class="detail-value">{{ batch.total_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">已完成:</span>
|
||||
<span class="detail-value">{{ batch.completed_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">失败:</span>
|
||||
<span class="detail-value">{{ batch.failed_tasks }}</span>
|
||||
</div>
|
||||
<div class="detail-item">
|
||||
<span class="detail-label">进度:</span>
|
||||
<span class="detail-value">{{ batch.progress|floatformat:1 }}%</span>
|
||||
</div>
|
||||
</div>
|
||||
{% if batch.status == 'running' %}
|
||||
<div class="progress-bar">
|
||||
<div class="progress-fill" style="width: {{ batch.progress }}%"></div>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<div class="no-data">
|
||||
暂无批次记录
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// 自动刷新页面
|
||||
setTimeout(function () {
|
||||
location.reload();
|
||||
}, 30000); // 30秒刷新一次
|
||||
</script>
|
||||
{% endblock %}
|
||||
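The crawler_status.html template above expects nodes, batches, and task_stats in its context and reloads itself every 30 seconds. The view that renders it is not part of this diff, so the following is only a minimal sketch of one possible shape, assuming the distributed-crawler state is kept in the Django cache; every cache key and field name here is an assumption:

from django.core.cache import cache
from django.shortcuts import render


def crawler_status(request):
    # Assumed cache layout: lists of dicts maintained by the crawler nodes.
    nodes = cache.get("crawler:nodes", [])
    batches = cache.get("crawler:batches", [])

    task_stats = {
        "total_nodes": len(nodes),
        "active_tasks": sum(n.get("active_tasks", 0) for n in nodes),
        "total_batches": len(batches),
    }
    for batch in batches:
        total = batch.get("total_tasks") or 0
        done = batch.get("completed_tasks", 0)
        # The template formats this with floatformat:1 and uses it as a bar width.
        batch["progress"] = (done / total * 100) if total else 0

    return render(request, "admin/crawler_status.html", {
        "nodes": nodes,
        "batches": batches,
        "task_stats": task_stats,
    })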
@@ -40,7 +40,16 @@
margin-top: 20px;
}

.content img {
/* 优化:确保图片和视频不会超出容器显示 */
.content img, .content video {
max-width: 100%;
height: auto;
display: block;
margin: 10px 0;
}

/* 优化:确保iframe也不会超出容器显示 */
.content iframe {
max-width: 100%;
height: auto;
}
@@ -61,7 +70,7 @@
body {
padding: 10px;
}

.container {
padding: 15px;
}
@@ -69,21 +78,21 @@
</style>
</head>
<body>
<div class="container">
<a href="{% url 'article_list' %}" class="back-link">« 返回文章列表</a>
<div class="container">
<a href="{% url 'article_list' %}" class="back-link">« 返回文章列表</a>

<h1>{{ article.title }}</h1>
<h1>{{ article.title }}</h1>

<div class="meta">
网站: {{ article.website.name }} |
发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
创建时间: {{ article.created_at|date:"Y-m-d H:i" }} |
源网址: <a href="{{ article.url }}" target="_blank">{{ article.url }}</a>
</div>

<div class="content">
{{ article.content|safe }}
</div>
<div class="meta">
网站: {{ article.website.name }} |
发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
创建时间: {{ article.created_at|date:"Y-m-d H:i" }} |
源网址: <a href="{{ article.url }}" target="_blank">{{ article.url }}</a>
</div>

<div class="content">
{{ article.content|safe }}
</div>
</div>
</body>
</html>
@@ -17,7 +17,7 @@
|
||||
background: white;
|
||||
padding: 30px;
|
||||
margin-bottom: 20px;
|
||||
box-shadow: 0 2px 5px rgba(0,0,0,0.05); /* 添加轻微阴影 */
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* 添加轻微阴影 */
|
||||
border-radius: 8px; /* 添加圆角 */
|
||||
}
|
||||
|
||||
@@ -240,7 +240,7 @@
|
||||
<form method="get">
|
||||
<input type="text" name="q" placeholder="输入关键词搜索文章..." value="{{ search_query }}">
|
||||
{% if selected_website %}
|
||||
<input type="hidden" name="website" value="{{ selected_website.id }}">
|
||||
<input type="hidden" name="website" value="{{ selected_website.id }}">
|
||||
{% endif %}
|
||||
<input type="submit" value="搜索">
|
||||
</form>
|
||||
@@ -251,9 +251,11 @@
|
||||
<div class="sidebar">
|
||||
<div class="filters">
|
||||
<strong>按网站筛选:</strong>
|
||||
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" {% if not selected_website %}class="active" {% endif %}>全部</a>
|
||||
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}"
|
||||
{% if not selected_website %}class="active" {% endif %}>全部</a>
|
||||
{% for website in websites %}
|
||||
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" {% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
|
||||
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}"
|
||||
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
@@ -262,10 +264,10 @@
|
||||
<div class="main-content">
|
||||
<!-- 新增:搜索结果信息 -->
|
||||
{% if search_query %}
|
||||
<div class="search-info">
|
||||
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
|
||||
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
|
||||
</div>
|
||||
<div class="search-info">
|
||||
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
|
||||
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<!-- 新增:导出功能 -->
|
||||
@@ -280,60 +282,70 @@
|
||||
|
||||
<ul>
|
||||
{% for article in page_obj %}
|
||||
<li>
|
||||
<input type="checkbox" class="article-checkbox" value="{{ article.id }}" id="article_{{ article.id }}">
|
||||
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
|
||||
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
|
||||
</li>
|
||||
{% empty %}
|
||||
<li>暂无文章</li>
|
||||
<li>
|
||||
<input type="checkbox" class="article-checkbox" value="{{ article.id }}"
|
||||
id="article_{{ article.id }}">
|
||||
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
|
||||
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
|
||||
</li>
|
||||
{% empty %}
|
||||
<li>暂无文章</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
|
||||
<div class="pagination">
|
||||
{% if page_obj.has_previous %}
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">« 首页</a>
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">« 首页</a>
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
|
||||
{% endif %}
|
||||
{% if selected_website %}
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">«
|
||||
首页</a>
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">« 首页</a>
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
|
||||
|
||||
<!-- 修改:优化页码显示逻辑 -->
|
||||
{% with page_obj.paginator as paginator %}
|
||||
{% for num in paginator.page_range %}
|
||||
{% if page_obj.number == num %}
|
||||
<a href="#" class="current">{{ num }}</a>
|
||||
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||
{% endif %}
|
||||
{% elif num == 1 or num == paginator.num_pages %}
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||
{% endif %}
|
||||
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
|
||||
<span class="ellipsis">...</span>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% for num in paginator.page_range %}
|
||||
{% if page_obj.number == num %}
|
||||
<a href="#" class="current">{{ num }}</a>
|
||||
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
|
||||
{% if selected_website %}
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||
{% endif %}
|
||||
{% elif num == 1 or num == paginator.num_pages %}
|
||||
{% if selected_website %}
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||
{% endif %}
|
||||
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
|
||||
<span class="ellipsis">...</span>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endwith %}
|
||||
|
||||
{% if page_obj.has_next %}
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页 »</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页 »</a>
|
||||
{% endif %}
|
||||
{% if selected_website %}
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
|
||||
<a href="?website=
|
||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页
|
||||
»</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页
|
||||
»</a>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
@@ -396,25 +408,25 @@
|
||||
format: 'json'
|
||||
})
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.json';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.json';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
});
|
||||
|
||||
// 导出为CSV功能
|
||||
@@ -434,25 +446,25 @@
|
||||
format: 'csv'
|
||||
})
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.csv';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.csv';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
});
|
||||
|
||||
// 新增:导出为ZIP包功能
|
||||
@@ -472,25 +484,25 @@
|
||||
format: 'zip' // 指定导出格式为ZIP
|
||||
})
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.zip';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.zip';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
});
|
||||
|
||||
// 初始化导出按钮状态
|
||||
|
||||
313
core/tests.py
313
core/tests.py
@@ -1,3 +1,312 @@
|
||||
from django.test import TestCase
|
||||
import os
|
||||
import tempfile
|
||||
import shutil
|
||||
from django.test import TestCase, override_settings
|
||||
from django.core.management import call_command
|
||||
from django.core.management.base import CommandError
|
||||
from django.utils import timezone
|
||||
from django.core.files.uploadedfile import SimpleUploadedFile
|
||||
from unittest.mock import patch, MagicMock
|
||||
from .models import Website, Article
|
||||
from .utils import process_article, download_media, is_valid_url, full_site_crawler
|
||||
from .tasks import crawl_website, crawl_all_websites, health_check
|
||||
|
||||
# Create your tests here.
|
||||
|
||||
class WebsiteModelTest(TestCase):
|
||||
"""网站模型测试"""
|
||||
|
||||
def setUp(self):
|
||||
self.website = Website.objects.create(
|
||||
name='测试网站',
|
||||
base_url='https://test.com',
|
||||
description='测试描述'
|
||||
)
|
||||
|
||||
def test_website_creation(self):
|
||||
"""测试网站创建"""
|
||||
self.assertEqual(self.website.name, '测试网站')
|
||||
self.assertEqual(self.website.base_url, 'https://test.com')
|
||||
self.assertTrue(self.website.enabled)
|
||||
|
||||
def test_website_str(self):
|
||||
"""测试网站字符串表示"""
|
||||
self.assertEqual(str(self.website), '测试网站')
|
||||
|
||||
|
||||
class ArticleModelTest(TestCase):
|
||||
"""文章模型测试"""
|
||||
|
||||
def setUp(self):
|
||||
self.website = Website.objects.create(
|
||||
name='测试网站',
|
||||
base_url='https://test.com'
|
||||
)
|
||||
self.article = Article.objects.create(
|
||||
website=self.website,
|
||||
title='测试文章',
|
||||
url='https://test.com/article/1',
|
||||
content='<p>测试内容</p>',
|
||||
media_files=['image1.jpg', 'image2.jpg']
|
||||
)
|
||||
|
||||
def test_article_creation(self):
|
||||
"""测试文章创建"""
|
||||
self.assertEqual(self.article.title, '测试文章')
|
||||
self.assertEqual(self.article.url, 'https://test.com/article/1')
|
||||
self.assertEqual(len(self.article.media_files), 2)
|
||||
|
||||
def test_article_str(self):
|
||||
"""测试文章字符串表示"""
|
||||
self.assertEqual(str(self.article), '测试文章')
|
||||
|
||||
|
||||
class UtilsTest(TestCase):
|
||||
"""工具函数测试"""
|
||||
|
||||
def setUp(self):
|
||||
self.website = Website.objects.create(
|
||||
name='测试网站',
|
||||
base_url='https://test.com'
|
||||
)
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.temp_dir)
|
||||
|
||||
def test_is_valid_url(self):
|
||||
"""测试URL验证"""
|
||||
from .utils import is_valid_url
|
||||
|
||||
# 有效URL
|
||||
self.assertTrue(is_valid_url('https://test.com/article', 'test.com'))
|
||||
self.assertTrue(is_valid_url('http://test.com/article', 'test.com'))
|
||||
|
||||
# 无效URL
|
||||
self.assertFalse(is_valid_url('https://other.com/article', 'test.com'))
|
||||
self.assertFalse(is_valid_url('ftp://test.com/article', 'test.com'))
|
||||
self.assertFalse(is_valid_url('invalid-url', 'test.com'))
|
||||
|
||||
@patch('core.utils.requests.get')
|
||||
def test_download_media(self, mock_get):
|
||||
"""测试媒体下载"""
|
||||
# 模拟响应
|
||||
mock_response = MagicMock()
|
||||
mock_response.content = b'fake image content'
|
||||
mock_response.headers = {'content-type': 'image/jpeg'}
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
# 测试下载
|
||||
result = download_media('https://test.com/image.jpg', self.temp_dir)
|
||||
self.assertIsNotNone(result)
|
||||
self.assertTrue(os.path.exists(result))
|
||||
|
||||
@patch('core.utils.requests.get')
|
||||
@patch('core.utils.download_media')
|
||||
def test_process_article_success(self, mock_download_media, mock_get):
|
||||
"""测试文章处理成功"""
|
||||
# 模拟HTML响应
|
||||
html_content = '''
|
||||
<html>
|
||||
<head><title>测试文章</title></head>
|
||||
<body>
|
||||
<h1>测试文章标题</h1>
|
||||
<div class="content">
|
||||
<p>测试文章内容</p>
|
||||
<img src="https://test.com/image.jpg">
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
|
||||
mock_response = MagicMock()
|
||||
mock_response.text = html_content
|
||||
mock_response.encoding = 'utf-8'
|
||||
mock_response.raise_for_status.return_value = None
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
# 模拟媒体下载
|
||||
mock_download_media.return_value = '/tmp/test_image.jpg'
|
||||
|
||||
# 测试文章处理
|
||||
process_article('https://test.com/article/1', self.website)
|
||||
|
||||
# 验证文章是否保存
|
||||
article = Article.objects.filter(url='https://test.com/article/1').first()
|
||||
self.assertIsNotNone(article)
|
||||
self.assertEqual(article.title, '测试文章标题')
|
||||
|
||||
|
||||
class ManagementCommandsTest(TestCase):
|
||||
"""管理命令测试"""
|
||||
|
||||
def setUp(self):
|
||||
self.website = Website.objects.create(
|
||||
name='测试网站',
|
||||
base_url='https://test.com'
|
||||
)
|
||||
|
||||
@patch('core.management.commands.crawl_all_media.call_command')
|
||||
def test_crawl_all_media_command(self, mock_call_command):
|
||||
"""测试批量爬取命令"""
|
||||
# 模拟命令执行
|
||||
mock_call_command.return_value = None
|
||||
|
||||
# 执行命令
|
||||
call_command('crawl_all_media', media='rmrb,xinhua')
|
||||
|
||||
# 验证命令被调用
|
||||
mock_call_command.assert_called()
|
||||
|
||||
|
||||
class CeleryTasksTest(TestCase):
|
||||
"""Celery任务测试"""
|
||||
|
||||
def setUp(self):
|
||||
self.website = Website.objects.create(
|
||||
name='测试网站',
|
||||
base_url='https://test.com'
|
||||
)
|
||||
|
||||
@patch('core.tasks.full_site_crawler')
|
||||
def test_crawl_website_task(self, mock_crawler):
|
||||
"""测试单个网站爬取任务"""
|
||||
# 模拟爬虫函数
|
||||
mock_crawler.return_value = None
|
||||
|
||||
# 执行任务
|
||||
result = crawl_website(self.website.id)
|
||||
|
||||
# 验证结果
|
||||
self.assertEqual(result['website_id'], self.website.id)
|
||||
self.assertEqual(result['website_name'], '测试网站')
|
||||
self.assertEqual(result['status'], 'success')
|
||||
|
||||
def test_crawl_website_task_invalid_id(self):
|
||||
"""测试无效网站ID的任务"""
|
||||
# 执行任务
|
||||
with self.assertRaises(Exception):
|
||||
crawl_website(99999)
|
||||
|
||||
@patch('core.tasks.crawl_website.delay')
|
||||
def test_crawl_all_websites_task(self, mock_delay):
|
||||
"""测试批量爬取任务"""
|
||||
# 模拟子任务
|
||||
mock_result = MagicMock()
|
||||
mock_result.id = 'task-123'
|
||||
mock_delay.return_value = mock_result
|
||||
|
||||
# 执行任务
|
||||
result = crawl_all_websites()
|
||||
|
||||
# 验证结果
|
||||
self.assertEqual(result['total_websites'], 1)
|
||||
self.assertEqual(result['status'], 'started')
|
||||
|
||||
def test_health_check_task(self):
|
||||
"""测试健康检查任务"""
|
||||
# 执行任务
|
||||
result = health_check()
|
||||
|
||||
# 验证结果
|
||||
self.assertEqual(result['database'], 'ok')
|
||||
self.assertEqual(result['website_count'], 1)
|
||||
self.assertEqual(result['article_count'], 0)
|
||||
|
||||
|
||||
class IntegrationTest(TestCase):
|
||||
"""集成测试"""
|
||||
|
||||
def setUp(self):
|
||||
self.website = Website.objects.create(
|
||||
name='集成测试网站',
|
||||
base_url='https://integration-test.com'
|
||||
)
|
||||
|
||||
def test_full_workflow(self):
|
||||
"""测试完整工作流程"""
|
||||
# 1. 创建网站
|
||||
self.assertEqual(Website.objects.count(), 1)
|
||||
|
||||
# 2. 创建文章
|
||||
article = Article.objects.create(
|
||||
website=self.website,
|
||||
title='集成测试文章',
|
||||
url='https://integration-test.com/article/1',
|
||||
content='<p>集成测试内容</p>'
|
||||
)
|
||||
|
||||
# 3. 验证关联关系
|
||||
self.assertEqual(article.website, self.website)
|
||||
self.assertEqual(self.website.article_set.count(), 1)
|
||||
|
||||
# 4. 验证数据完整性
|
||||
self.assertIsNotNone(article.created_at)
|
||||
self.assertIsInstance(article.media_files, list)
|
||||
|
||||
|
||||
@override_settings(MEDIA_ROOT=tempfile.mkdtemp())
|
||||
class MediaHandlingTest(TestCase):
|
||||
"""媒体文件处理测试"""
|
||||
|
||||
def setUp(self):
|
||||
self.website = Website.objects.create(
|
||||
name='媒体测试网站',
|
||||
base_url='https://media-test.com'
|
||||
)
|
||||
|
||||
def test_media_files_field(self):
|
||||
"""测试媒体文件字段"""
|
||||
article = Article.objects.create(
|
||||
website=self.website,
|
||||
title='媒体测试文章',
|
||||
url='https://media-test.com/article/1',
|
||||
content='<p>测试内容</p>',
|
||||
media_files=['image1.jpg', 'video1.mp4']
|
||||
)
|
||||
|
||||
# 验证媒体文件列表
|
||||
self.assertEqual(len(article.media_files), 2)
|
||||
self.assertIn('image1.jpg', article.media_files)
|
||||
self.assertIn('video1.mp4', article.media_files)
|
||||
|
||||
|
||||
class ErrorHandlingTest(TestCase):
|
||||
"""错误处理测试"""
|
||||
|
||||
def test_duplicate_url_handling(self):
|
||||
"""测试重复URL处理"""
|
||||
website = Website.objects.create(
|
||||
name='错误测试网站',
|
||||
base_url='https://error-test.com'
|
||||
)
|
||||
|
||||
# 创建第一篇文章
|
||||
article1 = Article.objects.create(
|
||||
website=website,
|
||||
title='第一篇文章',
|
||||
url='https://error-test.com/article/1',
|
||||
content='<p>内容1</p>'
|
||||
)
|
||||
|
||||
# 尝试创建相同URL的文章
|
||||
with self.assertRaises(Exception):
|
||||
Article.objects.create(
|
||||
website=website,
|
||||
title='第二篇文章',
|
||||
url='https://error-test.com/article/1', # 相同URL
|
||||
content='<p>内容2</p>'
|
||||
)
|
||||
|
||||
def test_invalid_website_data(self):
|
||||
"""测试无效网站数据"""
|
||||
# 测试重复名称(unique约束)
|
||||
Website.objects.create(
|
||||
name='测试网站1',
|
||||
base_url='https://test1.com'
|
||||
)
|
||||
|
||||
with self.assertRaises(Exception):
|
||||
Website.objects.create(
|
||||
name='测试网站1', # 重复名称
|
||||
base_url='https://test2.com'
|
||||
)
|
||||
|
||||
33
core/urls.py
33
core/urls.py
@@ -1,17 +1,24 @@
from django.urls import path, include
from . import views
# 添加以下导入
from django.contrib import admin
from django.urls import path
from . import views, api

urlpatterns = [
# 原有视图
path('', views.article_list, name='article_list'),
path('article/<int:article_id>/', views.article_detail, name='article_detail'),
path('run-crawler/', views.run_crawler, name='run_crawler'),
# 新增:检查爬虫状态的路由
path('crawler-status/', views.crawler_status, name='crawler_status'),
# 新增:暂停爬虫的路由
path('pause-crawler/', views.pause_crawler, name='pause_crawler'),
# 添加导出文章的路由
path('export-articles/', views.export_articles, name='export_articles'),
# 添加自定义管理后台的路由
]

# API接口
path('api/health/', api.HealthView.as_view(), name='api_health'),
path('api/websites/', api.WebsitesView.as_view(), name='api_websites'),
path('api/websites/<int:website_id>/', api.api_website_detail, name='api_website_detail'),
path('api/websites/<int:website_id>/crawl/', api.api_crawl_website, name='api_crawl_website'),
path('api/articles/', api.api_articles, name='api_articles'),
path('api/articles/<int:article_id>/', api.api_article_detail, name='api_article_detail'),
path('api/crawler/status/', api.api_crawler_status, name='api_crawler_status'),
path('api/crawler/distributed/', api.api_start_distributed_crawl, name='api_start_distributed_crawl'),
path('api/crawler/batch/<str:batch_id>/', api.api_batch_status, name='api_batch_status'),
path('api/cleanup/', api.api_cleanup_articles, name='api_cleanup_articles'),
path('api/stats/', api.api_stats, name='api_stats'),

# 添加导出文章的URL
path('api/export/', api.export_articles, name='export_articles'),
]
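With these routes in place, the export endpoint is served at /api/export/ and, per the fetch() handlers in the article list template, expects a JSON body containing the selected article ids and a format string. A hedged example of calling it from a script; the host and port are assumptions, and the 'article_ids' field name is a guess because the exact key used by the front end is truncated in this diff:

import requests

# Assumed local development server; adjust host/port and auth as needed.
EXPORT_URL = "http://127.0.0.1:8000/api/export/"

payload = {
    # 'article_ids' is a guess -- the real field name is set in the
    # article_list template's fetch() call, which this excerpt truncates.
    "article_ids": [1, 2, 3],
    "format": "zip",  # 'json', 'csv', 'docx' and 'zip' appear in the diff
}

resp = requests.post(EXPORT_URL, json=payload, timeout=30)
resp.raise_for_status()

with open("articles.zip", "wb") as fh:
    fh.write(resp.content)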
106
core/utils.py
106
core/utils.py
@@ -26,8 +26,9 @@ def get_selenium_driver():
|
||||
chrome_options.add_argument("--disable-dev-shm-usage")
|
||||
chrome_options.add_argument("--disable-gpu")
|
||||
chrome_options.add_argument("--window-size=1920,1080")
|
||||
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||
|
||||
chrome_options.add_argument(
|
||||
"--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||
|
||||
service = Service(ChromeDriverManager().install())
|
||||
driver = webdriver.Chrome(service=service, options=chrome_options)
|
||||
return driver
|
||||
@@ -35,6 +36,7 @@ def get_selenium_driver():
|
||||
print(f"创建Selenium WebDriver失败: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def get_page_with_selenium(url, website_name):
|
||||
"""使用Selenium获取动态加载的页面内容"""
|
||||
driver = None
|
||||
@@ -42,17 +44,17 @@ def get_page_with_selenium(url, website_name):
|
||||
driver = get_selenium_driver()
|
||||
if not driver:
|
||||
return None
|
||||
|
||||
|
||||
print(f"使用Selenium加载页面: {url}")
|
||||
driver.get(url)
|
||||
|
||||
|
||||
# 等待页面加载完成
|
||||
wait_time = 10
|
||||
if "学习强国" in website_name:
|
||||
wait_time = 15 # 学习强国需要更长时间
|
||||
elif "法治日报" in website_name:
|
||||
wait_time = 12 # 法治日报需要较长时间
|
||||
|
||||
|
||||
# 等待页面主要内容加载
|
||||
try:
|
||||
WebDriverWait(driver, wait_time).until(
|
||||
@@ -60,14 +62,14 @@ def get_page_with_selenium(url, website_name):
|
||||
)
|
||||
except:
|
||||
print(f"等待页面加载超时: {url}")
|
||||
|
||||
|
||||
# 额外等待时间确保动态内容加载完成
|
||||
time.sleep(3)
|
||||
|
||||
|
||||
# 获取页面源码
|
||||
page_source = driver.page_source
|
||||
return page_source
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Selenium获取页面失败: {url}, 错误: {e}")
|
||||
return None
|
||||
@@ -78,6 +80,7 @@ def get_page_with_selenium(url, website_name):
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
def download_media(url, save_dir):
|
||||
try:
|
||||
# 添加请求头以避免403 Forbidden错误
|
||||
@@ -236,7 +239,7 @@ def process_article(url, website):
|
||||
need_selenium = False
|
||||
if any(name in website.name for name in ["学习强国", "xuexi", "法治日报", "legaldaily"]):
|
||||
need_selenium = True
|
||||
|
||||
|
||||
try:
|
||||
if need_selenium:
|
||||
# 使用Selenium获取动态加载的内容
|
||||
@@ -244,28 +247,28 @@ def process_article(url, website):
|
||||
if not page_source:
|
||||
print(f"Selenium获取页面失败:{url}")
|
||||
return
|
||||
|
||||
|
||||
# 检查页面内容是否过短
|
||||
min_length = 100 if "法治日报" in website.name else 300
|
||||
if len(page_source) < min_length:
|
||||
print(f"页面内容过短,可能是重定向页面:{url}")
|
||||
return
|
||||
|
||||
|
||||
# 创建BeautifulSoup对象
|
||||
soup = BeautifulSoup(page_source, "html.parser")
|
||||
else:
|
||||
# 使用requests获取静态内容
|
||||
resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
|
||||
resp.raise_for_status()
|
||||
|
||||
|
||||
# 检查是否是重定向页面
|
||||
if len(resp.text) < 300:
|
||||
print(f"页面内容过短,可能是重定向页面:{url}")
|
||||
return
|
||||
|
||||
|
||||
# 创建BeautifulSoup对象
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"请求失败:{url},错误:{e}")
|
||||
return
|
||||
@@ -353,7 +356,7 @@ def process_article(url, website):
            heading_text = heading.get_text(strip=True)
            if title_text in heading_text or heading_text in title_text:
                heading.decompose()

        # Remove elements whose class contains "title"
        for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
            title_element_text = title_element.get_text(strip=True)
@@ -489,13 +492,13 @@ def process_article(url, website):
            soup.find("p", class_="title") or
            soup.find("title")
        )

        # Special handling for 求是 (Qiushi): if the title is empty or too short, fall back to the <title> tag
        if title_tag:
            title_text = title_tag.get_text(strip=True)
            if not title_text or len(title_text) < 5:
                title_tag = soup.find("title")

        # Special handling for 求是: make sure a title is extracted at all
        if not title_tag or not title_tag.get_text(strip=True):
            title_tag = soup.find("title")
@@ -522,7 +525,7 @@ def process_article(url, website):
                # If the <strong> sits in the first two paragraphs of the body, drop it
                if parent_p in content_tag.find_all("p")[:2]:
                    strong_tag.decompose()

        # Remove duplicated titles inside h1/h2/h3 heading elements
        for heading in content_tag.find_all(["h1", "h2", "h3"]):
            heading_text = heading.get_text(strip=True)
@@ -530,11 +533,12 @@ def process_article(url, website):
                # Make sure title_tag itself is not removed
                if heading != title_tag:
                    heading.decompose()

        # Remove elements whose class contains "title"
        for title_element in content_tag.find_all(class_=lambda x: x and "title" in x):
            title_element_text = title_element.get_text(strip=True)
            if title_element_text and (title_text in title_element_text or title_element_text in title_text):
            if title_element_text and (
                    title_text in title_element_text or title_element_text in title_text):
                # Make sure title_tag itself is not removed
                if title_element != title_tag:
                    title_element.decompose()
@@ -583,7 +587,7 @@ def process_article(url, website):
            soup.find("h2") or  # 解放军报 (PLA Daily) uses an h2 tag for the title
            soup.find("title")
        )

        # Special handling for 解放军报: if the title is empty or too short, fall back to the <title> tag
        if title_tag:
            title_text = title_tag.get_text(strip=True)
@@ -606,34 +610,34 @@ def process_article(url, website):
        # Remove breadcrumb navigation
        for breadcrumb in content_tag.find_all("ol", class_="breadcrumb"):
            breadcrumb.decompose()

        # Remove share-related elements
        for share_element in content_tag.find_all("div", class_="share-custom"):
            share_element.decompose()

        # Remove author-information paragraphs
        for author_p in content_tag.find_all("p"):
            text = author_p.get_text(strip=True)
            if "来源:" in text or "作者:" in text or "责任编辑:" in text or "发布:" in text:
                author_p.decompose()

        # Remove progress bars
        for progress in content_tag.find_all("div", class_="progress-bar"):
            progress.decompose()

        # Remove players
        for player in content_tag.find_all("div", class_="player"):
            player.decompose()

        # Remove media URL containers
        for media in content_tag.find_all("div", id="mediaurl"):
            media.decompose()

        # Remove news lists (but keep their contents)
        for news_list in content_tag.find_all("ul", id="main-news-list"):
            # Do not delete the whole <ul>; unwrap it so its contents are kept
            news_list.unwrap()

        # Remove editor information
        for editor_element in content_tag.find_all("div", class_="editor"):
            editor_element.decompose()
@@ -744,13 +748,13 @@ def process_article(url, website):
            soup.find("h1") or
            soup.find("title")
        )

        # Special handling for 工人日报 (Workers' Daily): if the title is empty or too short, fall back to the <title> tag
        if title_tag:
            title_text = title_tag.get_text(strip=True)
            if not title_text or len(title_text) < 5:
                title_tag = soup.find("title")

        # Further handling: if the h1 title carries too much unrelated text, try the <title> tag for a cleaner title
        if title_tag and title_tag.name == 'h1':
            title_text = title_tag.get_text(strip=True)
@@ -877,11 +881,11 @@ def process_article(url, website):
            soup.find("h1") or
            soup.find("title")
        )

        # Special handling for 中国纪检监察报: make sure a title is extracted at all
        if not title_tag or not title_tag.get_text(strip=True):
            title_tag = soup.find("title")

        content_tag = (
            soup.find("div", class_="content") or
            soup.find("div", class_="article-content") or
@@ -955,11 +959,11 @@ def process_article(url, website):
            soup.find("h1") or
            soup.find("title")
        )

        # Special handling for 中国青年报 (China Youth Daily): make sure a title is extracted at all
        if not title_tag or not title_tag.get_text(strip=True):
            title_tag = soup.find("title")

        content_tag = (
            soup.find("div", class_="main") or  # container specific to 中国青年报
            soup.find("div", class_="content") or
@@ -977,11 +981,11 @@ def process_article(url, website):
            soup.find("h1") or
            soup.find("title")
        )

        # Special handling for 中国妇女报 (China Women's News): make sure a title is extracted at all
        if not title_tag or not title_tag.get_text(strip=True):
            title_tag = soup.find("title")

        content_tag = (
            soup.find("div", class_="main") or  # container specific to 中国妇女报
            soup.find("div", class_="news") or  # container specific to 中国妇女报
@@ -1001,11 +1005,11 @@ def process_article(url, website):
            soup.find("h1") or
            soup.find("title")
        )

        # Special handling for 法治日报 (Legal Daily): make sure a title is extracted at all
        if not title_tag or not title_tag.get_text(strip=True):
            title_tag = soup.find("title")

        content_tag = (
            soup.find("div", class_="content-two") or  # prefer the content-two class
            soup.find("div", class_="article-content") or  # container specific to 法治日报
@@ -1058,13 +1062,13 @@ def process_article(url, website):
            soup.find("h1") or
            soup.find("title")
        )

        # Special handling for 农民日报 (Farmers' Daily): if the title looks garbled, fall back to the <title> tag
        if title_tag and title_tag.name == 'h1':
            title_text = title_tag.get_text(strip=True)
            if title_text and "\ufffd" in title_text:  # the original garbled-character list was lost in this view; U+FFFD is assumed here
                title_tag = soup.find("title")

        # Special handling for 农民日报: make sure a title is extracted at all
        if not title_tag or not title_tag.get_text(strip=True):
            title_tag = soup.find("title")
@@ -1078,7 +1082,7 @@ def process_article(url, website):
            soup.find("div", class_="article") or
            soup.find("div", class_="article-body")
        )

        # Special handling for 农民日报: if several detailCon containers are found, pick the one with the longest content
        if content_tag and content_tag.get('class') and 'detailCon' in content_tag.get('class', []):
            detail_cons = soup.find_all("div", class_="detailCon")
@@ -1116,17 +1120,17 @@ def process_article(url, website):
            soup.find("h1") or
            soup.find("title")
        )

        # Special handling for 学习强国 (Xuexi Qiangguo): make sure a title is extracted at all
        if not title_tag or not title_tag.get_text(strip=True):
            title_tag = soup.find("title")

        # Special handling for 学习强国: if the title is too short, fall back to the <title> tag
        if title_tag:
            title_text = title_tag.get_text(strip=True)
            if title_text and len(title_text) < 10:
                title_tag = soup.find("title")

        content_tag = (
            soup.find("div", class_="content") or
            soup.find("div", class_="article-content") or
@@ -1153,17 +1157,17 @@ def process_article(url, website):
            soup.find("h1") or
            soup.find("title")
        )

        # Special handling for 旗帜网 (Qizhi): if the title is empty or too short, fall back to the <title> tag
        if title_tag:
            title_text = title_tag.get_text(strip=True)
            if not title_text or len(title_text) < 5:
                title_tag = soup.find("title")

        # Special handling for 旗帜网: make sure a title is extracted at all
        if not title_tag or not title_tag.get_text(strip=True):
            title_tag = soup.find("title")

        # Special handling for 旗帜网: if the title is too short, fall back to the <title> tag
        if title_tag:
            title_text = title_tag.get_text(strip=True)
@@ -1232,13 +1236,13 @@ def process_article(url, website):
            soup.find("h1") or
            soup.find("title")
        )

        # Special handling for 中国网 (China.org.cn): if the title is empty or too short, fall back to the <title> tag
        if title_tag:
            title_text = title_tag.get_text(strip=True)
            if not title_text or len(title_text) < 5:
                title_tag = soup.find("title")

        content_tag = (
            soup.find("div", class_="article") or  # container specific to 中国网
            soup.find("div", class_="main") or
@@ -1281,7 +1285,7 @@ def process_article(url, website):
    # Final title handling - only fall back to the default when no site-specific handling applied
    if not title_tag:
        title_tag = soup.find("h1") or soup.find("title")

    title = title_tag.get_text(strip=True) if title_tag else "Untitled"

    # Additional title cleanup: strip any stray whitespace
@@ -1564,7 +1568,7 @@ def full_site_crawler(start_url, website, max_pages=1000):
            ("/content/" in path) or
            (path.startswith("/detail/") and len(path) > 10)
        )

        # Exclude list pages
        if "/index.html" in path or path.endswith("/"):
            is_article_page = False

@@ -412,4 +412,4 @@ def export_articles(request):
        return HttpResponse('Unsupported format', status=400)

    except Exception as e:
        return HttpResponse(f'Export failed: {str(e)}', status=500)
        return HttpResponse(f'Export failed: {str(e)}', status=500)

@@ -10,7 +10,12 @@ For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.1/ref/settings/
"""

import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
@@ -19,12 +24,12 @@ BASE_DIR = Path(__file__).resolve().parent.parent
# See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'django-insecure-_kr!&5j#i!)lo(=u-&5ni+21cwxcq)j-35k!ne20)fyx!u6dnl'
SECRET_KEY = os.getenv('SECRET_KEY', 'django-insecure-_kr!&5j#i!)lo(=u-&5ni+21cwxcq)j-35k!ne20)fyx!u6dnl')

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
DEBUG = os.getenv('DEBUG', 'True').lower() == 'true'

ALLOWED_HOSTS = []
ALLOWED_HOSTS = os.getenv('ALLOWED_HOSTS', 'localhost,127.0.0.1').split(',')

# Application definition

@@ -36,8 +41,15 @@ INSTALLED_APPS = [
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'core',
    'django_celery_beat',
    'django_celery_results',
    'rest_framework',
    'rest_framework.authtoken',
]

# Import admin extensions
# import core.admin_extended  # temporarily commented out to avoid a circular import

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
@@ -71,12 +83,30 @@ WSGI_APPLICATION = 'green_classroom.wsgi.application'
# Database
# https://docs.djangoproject.com/en/5.1/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': BASE_DIR / 'db.sqlite3',
# Choose the database backend from the environment
DB_ENGINE = os.getenv('DB_ENGINE', 'django.db.backends.sqlite3')

if DB_ENGINE == 'django.db.backends.postgresql':
    DATABASES = {
        'default': {
            'ENGINE': DB_ENGINE,
            'NAME': os.getenv('DB_NAME', 'green_classroom'),
            'USER': os.getenv('DB_USER', 'postgres'),
            'PASSWORD': os.getenv('DB_PASSWORD', ''),
            'HOST': os.getenv('DB_HOST', 'localhost'),
            'PORT': os.getenv('DB_PORT', '5432'),
            'OPTIONS': {
                # FIXME: 'charset' is a MySQL option; psycopg2 rejects it for PostgreSQL connections
                'charset': 'utf8mb4',
            },
        }
    }
else:
    DATABASES = {
        'default': {
            'ENGINE': 'django.db.backends.sqlite3',
            'NAME': BASE_DIR / 'db.sqlite3',
        }
    }
}

# Password validation
# https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
@@ -110,17 +140,118 @@ USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.1/howto/static-files/

STATIC_URL = 'static/'
STATIC_URL = '/static/'
STATIC_ROOT = os.getenv('STATIC_ROOT', os.path.join(BASE_DIR, 'data', 'static'))

# Default primary key field type
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'


import os

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

MEDIA_ROOT = os.path.join(BASE_DIR, 'date', 'media')
# Media files
MEDIA_ROOT = os.getenv('MEDIA_ROOT', os.path.join(BASE_DIR, 'data', 'media'))
MEDIA_URL = '/media/'

# Celery configuration
CELERY_BROKER_URL = os.getenv('CELERY_BROKER_URL', 'redis://localhost:6379/0')
CELERY_RESULT_BACKEND = os.getenv('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = TIME_ZONE
CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = 30 * 60  # 30 minutes

# Redis configuration
REDIS_URL = os.getenv('REDIS_URL', 'redis://localhost:6379/0')

# Logging configuration
LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'verbose': {
            'format': '{levelname} {asctime} {module} {process:d} {thread:d} {message}',
            'style': '{',
        },
        'simple': {
            'format': '{levelname} {message}',
            'style': '{',
        },
    },
    'handlers': {
        'file': {
            'level': os.getenv('LOG_LEVEL', 'INFO'),
            'class': 'logging.FileHandler',
            'filename': os.getenv('LOG_FILE', os.path.join(BASE_DIR, 'data', 'logs', 'django.log')),
            'formatter': 'verbose',
        },
        'console': {
            'level': os.getenv('LOG_LEVEL', 'INFO'),
            'class': 'logging.StreamHandler',
            'formatter': 'simple',
        },
    },
    'root': {
        'handlers': ['console', 'file'],
        'level': os.getenv('LOG_LEVEL', 'INFO'),
    },
    'loggers': {
        'django': {
            'handlers': ['console', 'file'],
            'level': os.getenv('LOG_LEVEL', 'INFO'),
            'propagate': False,
        },
        'core': {
            'handlers': ['console', 'file'],
            'level': os.getenv('LOG_LEVEL', 'INFO'),
            'propagate': False,
        },
    },
}

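Note that logging.FileHandler does not create missing directories, so the file handler above will fail at startup until data/logs exists. A minimal guard such as the following sketch, placed before LOGGING, would avoid that; it is a suggestion and not part of this commit (the LOG_FILE variable is introduced here only for the sketch):

# Suggested sketch (not in this commit): make sure the log directory exists before
# LOGGING is evaluated, since logging.FileHandler does not create missing directories.
LOG_FILE = os.getenv('LOG_FILE', os.path.join(BASE_DIR, 'data', 'logs', 'django.log'))
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
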
# Security settings
if not DEBUG:
    SECURE_BROWSER_XSS_FILTER = True
    SECURE_CONTENT_TYPE_NOSNIFF = True
    X_FRAME_OPTIONS = 'DENY'
    SECURE_HSTS_SECONDS = 31536000
    SECURE_HSTS_INCLUDE_SUBDOMAINS = True
    SECURE_HSTS_PRELOAD = True

# Crawler settings
CRAWLER_TIMEOUT = int(os.getenv('CRAWLER_TIMEOUT', 30))
CRAWLER_MAX_RETRIES = int(os.getenv('CRAWLER_MAX_RETRIES', 3))
CRAWLER_DELAY = int(os.getenv('CRAWLER_DELAY', 1))

# Selenium settings
SELENIUM_HEADLESS = os.getenv('SELENIUM_HEADLESS', 'True').lower() == 'true'
CHROME_DRIVER_PATH = os.getenv('CHROME_DRIVER_PATH', '/usr/bin/chromedriver')

# Sentry monitoring (optional)
SENTRY_DSN = os.getenv('SENTRY_DSN')
if SENTRY_DSN:
    import sentry_sdk
    from sentry_sdk.integrations.django import DjangoIntegration

    sentry_sdk.init(
        dsn=SENTRY_DSN,
        integrations=[DjangoIntegration()],
        traces_sample_rate=1.0,
        send_default_pii=True
    )

# Django REST Framework configuration
REST_FRAMEWORK = {
    'DEFAULT_RENDERER_CLASSES': [
        'rest_framework.renderers.JSONRenderer',
        'rest_framework.renderers.BrowsableAPIRenderer',
    ],
    'DEFAULT_PERMISSION_CLASSES': [
        'rest_framework.permissions.IsAuthenticated',
    ],
    'DEFAULT_AUTHENTICATION_CLASSES': [
        'rest_framework.authentication.SessionAuthentication',
        'rest_framework.authentication.TokenAuthentication',
    ],
}

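For reference, a sample .env covering the environment variables these settings read might look as follows; every value below is a placeholder to adapt, not a value taken from the repository:

SECRET_KEY=change-me
DEBUG=False
ALLOWED_HOSTS=localhost,127.0.0.1
DB_ENGINE=django.db.backends.postgresql
DB_NAME=green_classroom
DB_USER=postgres
DB_PASSWORD=change-me
DB_HOST=localhost
DB_PORT=5432
STATIC_ROOT=/srv/green_classroom/data/static
MEDIA_ROOT=/srv/green_classroom/data/media
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/0
REDIS_URL=redis://localhost:6379/0
LOG_LEVEL=INFO
LOG_FILE=/srv/green_classroom/data/logs/django.log
CRAWLER_TIMEOUT=30
CRAWLER_MAX_RETRIES=3
CRAWLER_DELAY=1
SELENIUM_HEADLESS=True
CHROME_DRIVER_PATH=/usr/bin/chromedriver
SENTRY_DSN=
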
@@ -1,31 +1,80 @@
amqp==5.3.1
asgiref==3.9.1
asttokens==3.0.0
attrs==25.3.0
beautifulsoup4==4.13.4
billiard==4.2.1
bs4==0.0.2
celery==5.5.3
certifi==2025.8.3
charset-normalizer==3.4.3
click==8.2.1
click-didyoumean==0.3.1
click-plugins==1.1.1.2
click-repl==0.3.0
coverage==7.10.3
cron-descriptor==1.4.5
decorator==5.2.1
Django==5.1
django-celery-beat==2.8.1
django-db-connection-pool==1.2.6
django-timezone-field==7.1
django_celery_results==2.6.0
djangorestframework==3.16.1
executing==2.2.0
factory_boy==3.3.3
Faker==37.5.3
h11==0.16.0
idna==3.10
iniconfig==2.1.0
ipython==9.4.0
ipython_pygments_lexers==1.1.1
jedi==0.19.2
kombu==5.5.4
lxml==6.0.0
m3u8==6.0.0
matplotlib-inline==0.1.7
outcome==1.3.0.post0
packaging==25.0
parso==0.8.4
pexpect==4.9.0
pluggy==1.6.0
prompt_toolkit==3.0.51
psycopg2-binary==2.9.10
ptyprocess==0.7.0
pure_eval==0.2.3
pycryptodome==3.23.0
Pygments==2.19.2
PySocks==1.7.1
pytest==8.4.1
pytest-cov==6.2.1
pytest-django==4.11.1
python-crontab==3.3.0
python-dateutil==2.9.0.post0
python-docx==1.2.0
python-dotenv==1.1.1
redis==6.4.0
requests==2.32.4
selenium==4.34.2
sentry-sdk==2.35.0
six==1.17.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.7
SQLAlchemy==2.0.43
sqlparams==6.2.0
sqlparse==0.5.3
stack-data==0.6.3
tqdm==4.67.1
traitlets==5.14.3
trio==0.30.0
trio-websocket==0.12.2
typing_extensions==4.14.1
tzdata==2025.2
urllib3==2.5.0
uv==0.8.8
vine==5.1.0
wcwidth==0.2.13
webdriver-manager==4.0.2
websocket-client==1.8.0
wsproto==1.2.0