Add export to the front end
core/admin.py
@@ -8,24 +8,53 @@ from django.http import HttpResponseRedirect
import csv
from django.http import HttpResponse
import json
# Imports needed by the view function below
from django.shortcuts import render, redirect
from django.urls import path
from django.contrib import admin
from django.http import JsonResponse
from django.views.decorators.http import require_http_methods
from django.core.management import call_command
import threading
import uuid
from django.utils import timezone


# Custom admin sites
class NewsCnAdminSite(AdminSite):
    site_header = "新华网管理后台"
    site_title = "新华网管理"
    index_title = "新华网内容管理"


class DongfangyancaoAdminSite(AdminSite):
    site_header = "东方烟草报管理后台"
    site_title = "东方烟草报管理"
    index_title = "东方烟草报内容管理"


# Instantiate the admin sites
news_cn_admin = NewsCnAdminSite(name='news_cn_admin')
dongfangyancao_admin = DongfangyancaoAdminSite(name='dongfangyancao_admin')

# View for running a crawler from the admin backend
def run_crawler_view(request):
    """Run a crawler from the admin backend."""
    if request.method == 'POST':
        website_name = request.POST.get('website_name')
        if not website_name:
            messages.error(request, '请选择要爬取的网站')
            return redirect('admin:core_article_changelist')

        try:
            # Pick the management command to run based on the selected website
            if website_name == 'crawl_xinhua':
                crawler_name = 'crawl_xinhua'
            elif website_name == 'crawl_dongfangyancao':
                crawler_name = 'crawl_dongfangyancao'
            elif website_name == 'crawl_articles':
                crawler_name = 'crawl_articles'
            else:
                # For any other website, fall back to the generic crawler command
                crawler_name = 'crawl_articles'

            # Run the crawler command without passing website_name as an argument
            call_command(crawler_name)

            messages.success(request, f'成功执行爬虫: {crawler_name}')
        except Exception as e:
            messages.error(request, f'执行爬虫失败: {str(e)}')

    return redirect('admin:core_article_changelist')


@admin.register(Website)
@@ -39,22 +68,16 @@ class ArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'website', 'pub_date')
    search_fields = ('title', 'content')
    # Action options
    actions = ['delete_selected_articles', 'delete_dongfangyancao_articles', 'export_as_csv', 'export_as_json',
               'export_as_word']
    actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json',
               'export_as_word', 'export_with_media']

    def delete_dongfangyancao_articles(self, request, queryset):
        """Delete all 东方烟草报 articles in one click."""
        # Look up the 东方烟草报 website object
        try:
            dongfangyancao_website = Website.objects.get(name='东方烟草报')
            # Delete every article belonging to that website
            deleted_count = Article.objects.filter(website=dongfangyancao_website).delete()[0]
            self.message_user(request, f"成功删除 {deleted_count} 篇东方烟草报文章", messages.SUCCESS)
        except Website.DoesNotExist:
            self.message_user(request, "未找到东方烟草报网站配置", messages.ERROR)

    # Display name for the action
    delete_dongfangyancao_articles.short_description = "删除所有东方烟草报文章"
    # Override get_urls to add a custom URL
    def get_urls(self):
        urls = super().get_urls()
        custom_urls = [
            path('run-crawler/', self.admin_site.admin_view(run_crawler_view), name='run_crawler'),
        ]
        return custom_urls + urls

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV."""
@@ -205,6 +228,155 @@ class ArticleAdmin(admin.ModelAdmin):
|
||||
|
||||
export_as_word.short_description = "导出选中文章为Word格式"
|
||||
|
||||
def export_with_media(self, request, queryset):
|
||||
"""导出选中的文章及媒体文件为ZIP包"""
|
||||
try:
|
||||
from docx import Document
|
||||
from io import BytesIO
|
||||
from docx.shared import Inches
|
||||
import zipfile
|
||||
except ImportError:
|
||||
self.message_user(request, "缺少必要库,请安装: pip install python-docx", messages.ERROR)
|
||||
return
|
||||
|
||||
# 创建内存中的ZIP文件
|
||||
zip_buffer = BytesIO()
|
||||
|
||||
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
|
||||
for article in queryset:
|
||||
# 为每篇文章创建单独的文件夹
|
||||
# Build a filesystem-safe folder name outside the f-string (backslashes and
# quote characters inside f-string expressions are a syntax error before Python 3.12)
safe_title = ''.join('_' if c in '\\/:*?"<>|' else c for c in article.title)
article_folder = f"article_{article.id}_{safe_title}"
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading(article.title, 0)
|
||||
|
||||
# 添加文章元数据
|
||||
doc.add_paragraph(f"网站: {article.website.name}")
|
||||
doc.add_paragraph(f"URL: {article.url}")
|
||||
doc.add_paragraph(
|
||||
f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
|
||||
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
# 添加文章内容
|
||||
doc.add_heading('内容', level=2)
|
||||
# 简单处理HTML内容,移除标签并处理图片
|
||||
from bs4 import BeautifulSoup
|
||||
soup = BeautifulSoup(article.content, 'html.parser')
|
||||
|
||||
# 处理内容中的图片
|
||||
for img in soup.find_all('img'):
|
||||
src = img.get('src', '')
|
||||
if src:
|
||||
# 尝试添加图片到文档
|
||||
try:
|
||||
import os
|
||||
from django.conf import settings
|
||||
import requests
|
||||
|
||||
# 构建完整的图片路径
|
||||
if src.startswith('http'):
|
||||
# 网络图片
|
||||
response = requests.get(src, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
# 将网络文件保存到ZIP
|
||||
zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(src)), response.content)
|
||||
else:
|
||||
# 本地图片
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
|
||||
if os.path.exists(full_path):
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', src.lstrip('/')))
|
||||
except Exception as e:
|
||||
# 如果添加图片失败,添加图片URL作为文本
|
||||
doc.add_paragraph(f"[图片: {src}]")
|
||||
|
||||
# 移除原始img标签
|
||||
img.decompose()
|
||||
|
||||
content_text = soup.get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 添加媒体文件信息并打包媒体文件
|
||||
if article.media_files:
|
||||
doc.add_heading('媒体文件', level=2)
|
||||
for media_file in article.media_files:
|
||||
try:
|
||||
import os
|
||||
from django.conf import settings
|
||||
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
# 检查文件扩展名以确定处理方式
|
||||
file_extension = os.path.splitext(media_file)[1].lower()
|
||||
|
||||
# 图片文件处理
|
||||
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
|
||||
if os.path.exists(full_path):
|
||||
# 添加图片到文档
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
response = requests.get(media_file, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
# 将网络文件保存到ZIP
|
||||
zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
# 视频文件处理
|
||||
elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
|
||||
# 视频文件只添加到ZIP包中,不在Word文档中显示
|
||||
if os.path.exists(full_path):
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
|
||||
# 在Word文档中添加视频文件信息
|
||||
doc.add_paragraph(f"[视频文件: {media_file}]")
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
# 将网络文件保存到ZIP
|
||||
response = requests.get(media_file, timeout=10)
|
||||
zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
|
||||
doc.add_paragraph(f"[视频文件: {media_file}]")
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
# 其他文件类型
|
||||
else:
|
||||
if os.path.exists(full_path):
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
|
||||
doc.add_paragraph(f"[文件: {media_file}]")
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
response = requests.get(media_file, timeout=10)
|
||||
zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
|
||||
doc.add_paragraph(f"[文件: {media_file}]")
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
except Exception as e:
|
||||
doc.add_paragraph(media_file)
|
||||
|
||||
# 保存每篇文章的Word文档到ZIP文件中的对应文件夹
|
||||
doc_buffer = BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'), doc_buffer.read())
|
||||
|
||||
# 创建HttpResponse
|
||||
zip_buffer.seek(0)
|
||||
from django.http import HttpResponse
|
||||
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
|
||||
response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
|
||||
return response
|
||||
|
||||
export_with_media.short_description = "导出选中文章及媒体文件(ZIP包)"
|
||||
|
||||
|
||||
# 为不同网站创建专门的文章管理类
|
||||
class NewsCnArticleAdmin(admin.ModelAdmin):
|
||||
@@ -342,8 +514,3 @@ class DongfangyancaoArticleAdmin(admin.ModelAdmin):
|
||||
|
||||
|
||||
# 在各自的管理站点中注册模型
|
||||
news_cn_admin.register(Website, WebsiteAdmin)
|
||||
news_cn_admin.register(Article, NewsCnArticleAdmin)
|
||||
|
||||
dongfangyancao_admin.register(Website, WebsiteAdmin)
|
||||
dongfangyancao_admin.register(Article, DongfangyancaoArticleAdmin)
|
||||
|
||||
@@ -1,21 +0,0 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import crawl_xinhua_list

class Command(BaseCommand):
    help = '批量爬取新华网文章'

    def handle(self, *args, **options):
        # Marker to confirm whether this command is actually invoked
        self.stdout.write(self.style.WARNING("crawl_xinhua command is being used"))

        list_url = "https://www.news.cn/legal/index.html"
        try:
            website = Website.objects.get(base_url="https://www.news.cn/")
        except Website.DoesNotExist:
            self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
            return

        self.stdout.write(f"开始爬取文章列表页: {list_url}")
        crawl_xinhua_list(list_url, website)
        self.stdout.write(self.style.SUCCESS("批量爬取完成"))
@@ -13,16 +13,20 @@ class Command(BaseCommand):
|
||||
help = '导出文章及相关的媒体文件(图片、视频等)'
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument('--format', type=str, default='json', help='导出格式: json 或 csv')
|
||||
parser.add_argument('--format', type=str, default='docx', help='导出格式: json、csv 或 docx')
|
||||
parser.add_argument('--website', type=str, help='指定网站名称导出特定网站的文章')
|
||||
parser.add_argument('--output', type=str, default='', help='输出文件路径')
|
||||
parser.add_argument('--include-media', action='store_true', help='包含媒体文件')
|
||||
# 修改默认值为True,使包含媒体文件成为默认行为
|
||||
parser.add_argument('--include-media', action='store_true', default=True, help='包含媒体文件')
|
||||
# 添加参数控制是否打包成zip
|
||||
parser.add_argument('--no-zip', action='store_true', help='不打包成zip文件')
|
||||
|
||||
def handle(self, *args, **options):
|
||||
format_type = options['format'].lower()
|
||||
website_name = options['website']
|
||||
output_path = options['output']
|
||||
include_media = options['include_media']
|
||||
no_zip = options['no_zip']
|
||||
|
||||
# 获取文章查询集
|
||||
articles = Article.objects.all()
|
||||
@@ -65,20 +69,26 @@ class Command(BaseCommand):
|
||||
# 确定输出路径
|
||||
if not output_path:
|
||||
timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
|
||||
if include_media:
|
||||
# 默认导出为zip格式
|
||||
output_path = f'articles_export_{timestamp}.zip'
|
||||
else:
|
||||
output_path = f'articles_export_{timestamp}.{format_type}'
|
||||
|
||||
# 执行导出
|
||||
if include_media:
|
||||
# 如果需要包含媒体文件或格式为docx,则默认打包成zip
|
||||
if include_media or format_type == 'docx':
|
||||
if no_zip:
|
||||
if format_type == 'docx':
|
||||
self.export_as_word(articles_data, output_path)
|
||||
elif format_type == 'json':
|
||||
self.export_as_json(articles_data, output_path)
|
||||
elif format_type == 'csv':
|
||||
self.export_as_csv(articles_data, output_path)
|
||||
else:
|
||||
self.export_with_media(articles_data, media_files, output_path, format_type)
|
||||
else:
|
||||
if format_type == 'json':
|
||||
self.export_as_json(articles_data, output_path)
|
||||
elif format_type == 'csv':
|
||||
self.export_as_csv(articles_data, output_path)
|
||||
# 添加Word格式导出支持
|
||||
elif format_type == 'docx':
|
||||
self.export_as_word(articles_data, output_path)
|
||||
else:
|
||||
@@ -220,7 +230,6 @@ class Command(BaseCommand):
|
||||
'media_files'] else ''
|
||||
writer.writerow(article_data)
|
||||
zipf.writestr(data_filename, csv_buffer.getvalue())
|
||||
# 添加Word格式支持
|
||||
elif format_type == 'docx':
|
||||
# 创建Word文档并保存到ZIP
|
||||
try:
|
||||
|
||||
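A hedged usage sketch for the export command changed in the hunk above (the command's module name is not shown here, so "export_articles" is an assumption; only the options come from the diff). With --include-media now defaulting to True, the command packs its output into a ZIP unless --no-zip is passed:

from django.core.management import call_command

# Assumed command name; default behaviour after this change: media included,
# output packed into a ZIP archive.
call_command('export_articles', format='docx', website='新华网')

# Opt out of the archive and write a plain JSON file instead.
call_command('export_articles', format='json', no_zip=True, output='articles.json')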
core/templates/admin/core/article/change_list.html (new file)
@@ -0,0 +1,19 @@
{% extends "admin/change_list.html" %}
{% load admin_urls %}

{% block object-tools %}
    {{ block.super }}
    <div style="margin-top: 10px;">
        <form method="post" action="{% url 'admin:run_crawler' %}" style="display: inline-block;">
            {% csrf_token %}
            <label for="website-select">选择网站:</label>
            <select name="website_name" id="website-select" required>
                <option value="">-- 请选择网站 --</option>
                <option value="crawl_xinhua">新华网</option>
                <option value="crawl_dongfangyancao">东方烟草报</option>
                <option value="crawl_articles">通用爬虫</option>
            </select>
            <input type="submit" value="执行爬虫" class="default" style="margin-left: 10px;" />
        </form>
    </div>
{% endblock %}
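Django picks this template up automatically because it sits at admin/<app_label>/<model_name>/change_list.html under an app template directory (with APP_DIRS enabled). A minimal sketch, not part of this commit, of pinning the template explicitly on the ModelAdmin instead:

from django.contrib import admin
from core.models import Article

class ArticleAdmin(admin.ModelAdmin):
    # Explicit override; equivalent to relying on the path-based lookup above.
    change_list_template = "admin/core/article/change_list.html"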
@@ -8,18 +8,17 @@
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
color: #333;
|
||||
max-width: 1200px; /* 修改:增加页面最大宽度 */
|
||||
margin: 0 auto;
|
||||
padding: 20px;
|
||||
background-color: #f8f9fa;
|
||||
background-color: #f0f8ff; /* 统一背景色调 */
|
||||
}
|
||||
|
||||
.container {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
|
||||
padding: 30px;
|
||||
margin-bottom: 20px;
|
||||
box-shadow: 0 2px 5px rgba(0,0,0,0.05); /* 添加轻微阴影 */
|
||||
border-radius: 8px; /* 添加圆角 */
|
||||
}
|
||||
|
||||
h1 {
|
||||
@@ -32,7 +31,7 @@
|
||||
.filters {
|
||||
margin-bottom: 20px;
|
||||
padding: 15px;
|
||||
background-color: #f1f8ff;
|
||||
background-color: #e3f2fd; /* 统一滤镜背景色调 */
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
@@ -40,8 +39,8 @@
|
||||
display: inline-block;
|
||||
padding: 5px 10px;
|
||||
margin: 0 5px 5px 0;
|
||||
background-color: #e1e8ed;
|
||||
color: #333;
|
||||
background-color: #bbdefb; /* 统一链接背景色调 */
|
||||
color: #0d47a1;
|
||||
text-decoration: none;
|
||||
border-radius: 3px;
|
||||
}
|
||||
@@ -58,7 +57,7 @@
|
||||
|
||||
li {
|
||||
padding: 10px 0;
|
||||
border-bottom: 1px solid #ecf0f1;
|
||||
border-bottom: 1px solid #e0e0e0; /* 统一分隔线颜色 */
|
||||
}
|
||||
|
||||
li:last-child {
|
||||
@@ -66,17 +65,17 @@
|
||||
}
|
||||
|
||||
a {
|
||||
color: #3498db;
|
||||
color: #1976d2; /* 统一链接颜色 */
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
color: #2980b9;
|
||||
color: #0d47a1; /* 统一悬停颜色 */
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.meta {
|
||||
color: #7f8c8d;
|
||||
color: #78909c; /* 统一元数据颜色 */
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
@@ -122,16 +121,17 @@
|
||||
.search-form {
|
||||
margin-bottom: 20px;
|
||||
padding: 15px;
|
||||
background-color: #f1f8ff;
|
||||
background-color: #e3f2fd; /* 统一搜索框背景色调 */
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
.search-form input[type="text"] {
|
||||
padding: 8px 12px;
|
||||
border: 1px solid #ddd;
|
||||
border: 1px solid #bbdefb; /* 统一边框颜色 */
|
||||
border-radius: 4px;
|
||||
width: 300px;
|
||||
margin-right: 10px;
|
||||
background-color: #fff;
|
||||
}
|
||||
|
||||
.search-form input[type="submit"] {
|
||||
@@ -148,19 +148,178 @@
|
||||
}
|
||||
|
||||
.search-info {
|
||||
color: #7f8c8d;
|
||||
color: #78909c; /* 统一搜索信息颜色 */
|
||||
font-size: 0.9em;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
/* 新增:左侧筛选栏样式 */
|
||||
.content-wrapper {
|
||||
display: flex;
|
||||
gap: 20px;
|
||||
}
|
||||
|
||||
.sidebar {
|
||||
flex: 0 0 200px;
|
||||
background-color: #e3f2fd; /* 统一边栏背景色调 */
|
||||
border-radius: 5px;
|
||||
padding: 15px;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.sidebar .filters {
|
||||
margin-bottom: 20px;
|
||||
padding: 0;
|
||||
background-color: transparent;
|
||||
}
|
||||
|
||||
.sidebar .filters strong {
|
||||
display: block;
|
||||
margin-bottom: 10px;
|
||||
color: #2c3e50;
|
||||
}
|
||||
|
||||
.sidebar .filters a {
|
||||
display: block;
|
||||
padding: 8px 10px;
|
||||
margin: 0 0 5px 0;
|
||||
background-color: #bbdefb; /* 统一边栏链接背景色调 */
|
||||
color: #0d47a1;
|
||||
text-decoration: none;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.sidebar .filters a.active {
|
||||
background-color: #3498db;
|
||||
color: white;
|
||||
}
|
||||
|
||||
/* 新增:导出功能样式 */
|
||||
.export-section {
|
||||
margin-bottom: 20px;
|
||||
padding: 15px;
|
||||
background-color: #e8f5e9; /* 统一导出区域背景色调 */
|
||||
border-radius: 5px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.export-btn {
|
||||
padding: 10px 20px;
|
||||
background-color: #4caf50; /* 统一按钮背景色调 */
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
font-size: 16px;
|
||||
margin: 0 5px;
|
||||
}
|
||||
|
||||
.export-btn:hover {
|
||||
background-color: #388e3c; /* 统一按钮悬停色调 */
|
||||
}
|
||||
|
||||
.export-btn:disabled {
|
||||
background-color: #9e9e9e; /* 统一禁用按钮色调 */
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.article-checkbox {
|
||||
margin-right: 10px;
|
||||
}
|
||||
|
||||
/* 新增:爬虫控制按钮样式 */
|
||||
.crawler-control {
|
||||
margin-bottom: 20px;
|
||||
padding: 15px;
|
||||
background-color: #fff3e0; /* 统一爬虫控制区域背景色调 */
|
||||
border-radius: 5px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.crawler-btn {
|
||||
padding: 10px 20px;
|
||||
background-color: #ff9800; /* 统一爬虫按钮背景色调 */
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
font-size: 16px;
|
||||
margin: 0 5px;
|
||||
}
|
||||
|
||||
.crawler-btn:hover {
|
||||
background-color: #f57c00; /* 统一爬虫按钮悬停色调 */
|
||||
}
|
||||
|
||||
.crawler-btn:disabled {
|
||||
background-color: #9e9e9e; /* 统一禁用爬虫按钮色调 */
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.crawler-result {
|
||||
margin-top: 10px;
|
||||
padding: 10px;
|
||||
border-radius: 4px;
|
||||
display: none;
|
||||
}
|
||||
|
||||
.crawler-result.success {
|
||||
background-color: #e8f5e9;
|
||||
color: #2e7d32;
|
||||
}
|
||||
|
||||
.crawler-result.error {
|
||||
background-color: #ffebee;
|
||||
color: #c62828;
|
||||
}
|
||||
|
||||
/* 新增:进度条样式 */
|
||||
.progress-container {
|
||||
margin-top: 10px;
|
||||
display: none;
|
||||
}
|
||||
|
||||
.progress-bar {
|
||||
width: 100%;
|
||||
height: 20px;
|
||||
background-color: #e0e0e0;
|
||||
border-radius: 10px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.progress-fill {
|
||||
height: 100%;
|
||||
background-color: #4caf50;
|
||||
width: 0%;
|
||||
transition: width 0.3s ease;
|
||||
}
|
||||
|
||||
.progress-text {
|
||||
margin-top: 5px;
|
||||
font-size: 14px;
|
||||
color: #666;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1>绿色课堂文章列表</h1>
|
||||
|
||||
<!-- 新增:返回首页链接 -->
|
||||
<div style="margin-bottom: 20px;">
|
||||
<a href="{% url 'article_list' %}" style="color: #3498db; text-decoration: none;">← 返回首页</a>
|
||||
<!-- 新增:爬虫控制按钮 -->
|
||||
<div class="crawler-control">
|
||||
<button id="runCrawlerBtn" class="crawler-btn" data-website="www.news.cn">执行新华网爬虫</button>
|
||||
<button id="runDongfangyancaoCrawlerBtn" class="crawler-btn" data-website="东方烟草报">执行东方烟草报爬虫</button>
|
||||
<button id="pauseCrawlerBtn" class="crawler-btn" disabled>暂停爬虫</button>
|
||||
<div id="crawlerProgress" class="progress-container">
|
||||
<div class="progress-bar">
|
||||
<div class="progress-fill" id="progressFill"></div>
|
||||
</div>
|
||||
<div class="progress-text" id="progressText">准备中...</div>
|
||||
</div>
|
||||
<div id="crawlerResult" class="crawler-result"></div>
|
||||
</div>
|
||||
|
||||
<!-- 新增:搜索表单 -->
|
||||
@@ -174,6 +333,9 @@
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<div class="content-wrapper">
|
||||
<!-- 左侧筛选栏 -->
|
||||
<div class="sidebar">
|
||||
<div class="filters">
|
||||
<strong>按网站筛选:</strong>
|
||||
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" {% if not selected_website %}class="active" {% endif %}>全部</a>
|
||||
@@ -181,7 +343,10 @@
|
||||
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" {% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- 主内容区域 -->
|
||||
<div class="main-content">
|
||||
<!-- 新增:搜索结果信息 -->
|
||||
{% if search_query %}
|
||||
<div class="search-info">
|
||||
@@ -190,9 +355,20 @@
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<!-- 新增:导出功能 -->
|
||||
<div class="export-section">
|
||||
<button id="selectAllBtn" class="export-btn">全选</button>
|
||||
<button id="deselectAllBtn" class="export-btn">取消全选</button>
|
||||
<button id="exportJsonBtn" class="export-btn" disabled>导出为JSON</button>
|
||||
<button id="exportCsvBtn" class="export-btn" disabled>导出为CSV</button>
|
||||
<!-- 新增:导出为ZIP包按钮 -->
|
||||
<button id="exportZipBtn" class="export-btn" disabled>导出为ZIP包</button>
|
||||
</div>
|
||||
|
||||
<ul>
|
||||
{% for article in page_obj %}
|
||||
<li>
|
||||
<input type="checkbox" class="article-checkbox" value="{{ article.id }}" id="article_{{ article.id }}">
|
||||
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
|
||||
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
|
||||
</li>
|
||||
@@ -247,6 +423,372 @@
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// 存储当前任务ID和检查状态的定时器
|
||||
let currentTaskId = null;
|
||||
let statusCheckInterval = null;
|
||||
|
||||
// 获取页面元素
|
||||
const runCrawlerBtn = document.getElementById('runCrawlerBtn');
|
||||
const runDongfangyancaoCrawlerBtn = document.getElementById('runDongfangyancaoCrawlerBtn');
|
||||
const pauseCrawlerBtn = document.getElementById('pauseCrawlerBtn');
|
||||
const progressContainer = document.getElementById('crawlerProgress');
|
||||
const progressFill = document.getElementById('progressFill');
|
||||
const progressText = document.getElementById('progressText');
|
||||
const resultDiv = document.getElementById('crawlerResult');
|
||||
|
||||
// 绑定爬虫按钮事件
|
||||
runCrawlerBtn.addEventListener('click', function() {
|
||||
runCrawler('www.news.cn', 'crawl_xinhua');
|
||||
});
|
||||
|
||||
runDongfangyancaoCrawlerBtn.addEventListener('click', function() {
|
||||
runCrawler('东方烟草报', 'crawl_dongfangyancao');
|
||||
});
|
||||
|
||||
// 暂停按钮事件
|
||||
pauseCrawlerBtn.addEventListener('click', function() {
|
||||
if (currentTaskId) {
|
||||
pauseCrawler(currentTaskId);
|
||||
}
|
||||
});
|
||||
|
||||
// 运行爬虫函数
|
||||
function runCrawler(websiteName, crawlerName) {
|
||||
// 禁用按钮,防止重复点击
|
||||
runCrawlerBtn.disabled = true;
|
||||
runDongfangyancaoCrawlerBtn.disabled = true;
|
||||
resultDiv.style.display = 'none';
|
||||
|
||||
// 显示进度区域
|
||||
progressContainer.style.display = 'block';
|
||||
updateProgress(0, '爬虫启动中...');
|
||||
|
||||
// 发送POST请求运行爬虫
|
||||
fetch('{% url "run_crawler" %}', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/x-www-form-urlencoded',
|
||||
'X-CSRFToken': '{{ csrf_token }}'
|
||||
},
|
||||
body: 'crawler_name=' + crawlerName
|
||||
})
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
if (data.status === 'success') {
|
||||
currentTaskId = data.task_id;
|
||||
pauseCrawlerBtn.disabled = false;
|
||||
// 启动轮询检查爬虫状态
|
||||
checkCrawlerStatus(currentTaskId);
|
||||
} else {
|
||||
// 显示错误信息
|
||||
resultDiv.style.display = 'block';
|
||||
resultDiv.className = 'crawler-result error';
|
||||
resultDiv.textContent = data.message;
|
||||
// 恢复按钮状态
|
||||
runCrawlerBtn.disabled = false;
|
||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
||||
progressContainer.style.display = 'none';
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
// 显示错误信息
|
||||
resultDiv.style.display = 'block';
|
||||
resultDiv.className = 'crawler-result error';
|
||||
resultDiv.textContent = '请求失败: ' + error;
|
||||
// 恢复按钮状态
|
||||
runCrawlerBtn.disabled = false;
|
||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
||||
progressContainer.style.display = 'none';
|
||||
});
|
||||
}
|
||||
|
||||
// 检查爬虫状态的函数
|
||||
function checkCrawlerStatus(taskId) {
|
||||
// 清除之前的定时器
|
||||
if (statusCheckInterval) {
|
||||
clearInterval(statusCheckInterval);
|
||||
}
|
||||
|
||||
// 设置新的定时器,每秒检查一次状态
|
||||
statusCheckInterval = setInterval(() => {
|
||||
fetch('{% url "crawler_status" %}', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/x-www-form-urlencoded',
|
||||
'X-CSRFToken': '{{ csrf_token }}'
|
||||
},
|
||||
body: 'task_id=' + taskId
|
||||
})
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
if (data.status === 'running') {
|
||||
// 更新进度信息(模拟进度)
|
||||
const elapsedTime = new Date() - new Date(data.start_time);
|
||||
const progress = Math.min(90, Math.floor(elapsedTime / 1000));
|
||||
updateProgress(progress, data.message);
|
||||
} else if (data.status === 'completed') {
|
||||
// 爬虫完成,显示结果
|
||||
clearInterval(statusCheckInterval);
|
||||
updateProgress(100, data.message);
|
||||
|
||||
// 恢复按钮状态
|
||||
runCrawlerBtn.disabled = false;
|
||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
||||
pauseCrawlerBtn.disabled = true;
|
||||
|
||||
// 显示结果信息
|
||||
resultDiv.style.display = 'block';
|
||||
resultDiv.className = 'crawler-result success';
|
||||
resultDiv.textContent = data.message;
|
||||
|
||||
// 3秒后自动隐藏进度条
|
||||
setTimeout(() => {
|
||||
progressContainer.style.display = 'none';
|
||||
}, 3000);
|
||||
|
||||
// 自动刷新页面以显示新文章
|
||||
setTimeout(() => {
|
||||
location.reload();
|
||||
}, 2000);
|
||||
} else if (data.status === 'paused') {
|
||||
// 爬虫暂停
|
||||
clearInterval(statusCheckInterval);
|
||||
updateProgress(data.progress || 0, '爬虫已暂停');
|
||||
|
||||
// 恢复按钮状态
|
||||
runCrawlerBtn.disabled = false;
|
||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
||||
pauseCrawlerBtn.disabled = true;
|
||||
|
||||
// 显示结果信息
|
||||
resultDiv.style.display = 'block';
|
||||
resultDiv.className = 'crawler-result success';
|
||||
resultDiv.textContent = '爬虫已暂停';
|
||||
} else if (data.status === 'error') {
|
||||
// 爬虫出错
|
||||
clearInterval(statusCheckInterval);
|
||||
progressContainer.style.display = 'none';
|
||||
resultDiv.style.display = 'block';
|
||||
resultDiv.className = 'crawler-result error';
|
||||
resultDiv.textContent = data.message;
|
||||
|
||||
// 恢复按钮状态
|
||||
runCrawlerBtn.disabled = false;
|
||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
||||
pauseCrawlerBtn.disabled = true;
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
clearInterval(statusCheckInterval);
|
||||
progressContainer.style.display = 'none';
|
||||
resultDiv.style.display = 'block';
|
||||
resultDiv.className = 'crawler-result error';
|
||||
resultDiv.textContent = '检查状态失败: ' + error;
|
||||
|
||||
// 恢复按钮状态
|
||||
runCrawlerBtn.disabled = false;
|
||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
||||
pauseCrawlerBtn.disabled = true;
|
||||
});
|
||||
}, 1000);
|
||||
}
|
||||
|
||||
// 更新进度条函数
|
||||
function updateProgress(percent, text) {
|
||||
progressFill.style.width = percent + '%';
|
||||
progressText.textContent = text;
|
||||
}
|
||||
|
||||
// 暂停爬虫函数
|
||||
function pauseCrawler(taskId) {
|
||||
fetch('{% url "pause_crawler" %}', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/x-www-form-urlencoded',
|
||||
'X-CSRFToken': '{{ csrf_token }}'
|
||||
},
|
||||
body: 'task_id=' + taskId
|
||||
})
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
if (data.status === 'success') {
|
||||
// 暂停成功,更新UI
|
||||
pauseCrawlerBtn.disabled = true;
|
||||
updateProgress(data.progress || 0, '正在暂停...');
|
||||
} else {
|
||||
// 显示错误信息
|
||||
resultDiv.style.display = 'block';
|
||||
resultDiv.className = 'crawler-result error';
|
||||
resultDiv.textContent = data.message;
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
// 显示错误信息
|
||||
resultDiv.style.display = 'block';
|
||||
resultDiv.className = 'crawler-result error';
|
||||
resultDiv.textContent = '暂停请求失败: ' + error;
|
||||
});
|
||||
}
|
||||
|
||||
// 导出功能相关JavaScript
|
||||
const checkboxes = document.querySelectorAll('.article-checkbox');
|
||||
const exportJsonBtn = document.getElementById('exportJsonBtn');
|
||||
const exportCsvBtn = document.getElementById('exportCsvBtn');
|
||||
const selectAllBtn = document.getElementById('selectAllBtn');
|
||||
const deselectAllBtn = document.getElementById('deselectAllBtn');
|
||||
// 新增:获取ZIP导出按钮元素
|
||||
const exportZipBtn = document.getElementById('exportZipBtn');
|
||||
|
||||
// 更新导出按钮状态
|
||||
function updateExportButtons() {
|
||||
const selectedCount = document.querySelectorAll('.article-checkbox:checked').length;
|
||||
exportJsonBtn.disabled = selectedCount === 0;
|
||||
exportCsvBtn.disabled = selectedCount === 0;
|
||||
exportZipBtn.disabled = selectedCount === 0; // 新增:更新ZIP导出按钮状态
|
||||
}
|
||||
|
||||
// 为所有复选框添加事件监听器
|
||||
checkboxes.forEach(checkbox => {
|
||||
checkbox.addEventListener('change', updateExportButtons);
|
||||
});
|
||||
|
||||
// 全选功能
|
||||
selectAllBtn.addEventListener('click', () => {
|
||||
checkboxes.forEach(checkbox => {
|
||||
checkbox.checked = true;
|
||||
});
|
||||
updateExportButtons();
|
||||
});
|
||||
|
||||
// 取消全选功能
|
||||
deselectAllBtn.addEventListener('click', () => {
|
||||
checkboxes.forEach(checkbox => {
|
||||
checkbox.checked = false;
|
||||
});
|
||||
updateExportButtons();
|
||||
});
|
||||
|
||||
// 导出为JSON功能
|
||||
exportJsonBtn.addEventListener('click', () => {
|
||||
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
|
||||
.map(checkbox => checkbox.value);
|
||||
|
||||
// 发送POST请求导出文章
|
||||
fetch('{% url "export_articles" %}', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'X-CSRFToken': '{{ csrf_token }}'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
article_ids: selectedArticles,
|
||||
format: 'json'
|
||||
})
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.json';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
});
|
||||
|
||||
// 导出为CSV功能
|
||||
exportCsvBtn.addEventListener('click', () => {
|
||||
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
|
||||
.map(checkbox => checkbox.value);
|
||||
|
||||
// 发送POST请求导出文章
|
||||
fetch('{% url "export_articles" %}', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'X-CSRFToken': '{{ csrf_token }}'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
article_ids: selectedArticles,
|
||||
format: 'csv'
|
||||
})
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.csv';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
});
|
||||
|
||||
// 新增:导出为ZIP包功能
|
||||
exportZipBtn.addEventListener('click', () => {
|
||||
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
|
||||
.map(checkbox => checkbox.value);
|
||||
|
||||
// 发送POST请求导出文章为ZIP包
|
||||
fetch('{% url "export_articles" %}', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
'X-CSRFToken': '{{ csrf_token }}'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
article_ids: selectedArticles,
|
||||
format: 'zip' // 指定导出格式为ZIP
|
||||
})
|
||||
})
|
||||
.then(response => {
|
||||
if (response.ok) {
|
||||
return response.blob();
|
||||
}
|
||||
throw new Error('导出失败');
|
||||
})
|
||||
.then(blob => {
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = 'articles.zip';
|
||||
document.body.appendChild(a);
|
||||
a.click();
|
||||
window.URL.revokeObjectURL(url);
|
||||
document.body.removeChild(a);
|
||||
})
|
||||
.catch(error => {
|
||||
alert('导出失败: ' + error);
|
||||
});
|
||||
});
|
||||
|
||||
// 初始化导出按钮状态
|
||||
updateExportButtons();
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
core/urls.py
@@ -1,10 +1,15 @@
from django.urls import path
from django.urls import path, include
from . import views
# Added import
from django.contrib import admin

urlpatterns = [
    # Home page: article list
    path('', views.article_list, name='article_list'),
    # Article detail
    path('article/<int:article_id>/', views.article_detail, name='article_detail'),
    # More paths can be added later
    path('run-crawler/', views.run_crawler, name='run_crawler'),
    # New: route for checking crawler status
    path('crawler-status/', views.crawler_status, name='crawler_status'),
    # Route for exporting articles
    path('export-articles/', views.export_articles, name='export_articles'),
    # Route for the custom admin backends
]
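These are the same route names the front-end template resolves with {% url %}. A small sketch, not part of the diff, of resolving them by name in Python, assuming core.urls stays mounted at the site root as in the project urls.py at the end of this commit:

from django.urls import reverse

reverse('run_crawler')       # '/run-crawler/'
reverse('crawler_status')    # '/crawler-status/'
reverse('export_articles')   # '/export-articles/'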
@@ -1,4 +1,3 @@
|
||||
# core/utils.py
|
||||
import os
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -42,6 +41,12 @@ def download_media(url, save_dir):
|
||||
filename += '.png'
|
||||
elif 'image/gif' in content_type:
|
||||
filename += '.gif'
|
||||
elif 'video/mp4' in content_type:
|
||||
filename += '.mp4'
|
||||
elif 'video/avi' in content_type:
|
||||
filename += '.avi'
|
||||
elif 'video/quicktime' in content_type:
|
||||
filename += '.mov'
|
||||
else:
|
||||
filename += '.bin' # 默认二进制扩展名
|
||||
|
||||
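The explicit Content-Type to extension mapping above can also be expressed with the standard library; a hedged alternative sketch (not what the diff actually does):

import mimetypes

def extension_for(content_type):
    # Strip any "; charset=..." parameter before the lookup.
    ext = mimetypes.guess_extension(content_type.split(';')[0].strip())
    return ext or '.bin'  # fall back to the same generic binary extension

# extension_for('video/mp4') -> '.mp4'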
@@ -61,6 +66,7 @@ def download_media(url, save_dir):
|
||||
|
||||
|
||||
def process_article(url, website):
|
||||
# 检查文章是否已存在,如果存在则跳过
|
||||
if Article.objects.filter(url=url).exists():
|
||||
print(f"文章已存在,跳过: {url}")
|
||||
return
|
||||
@@ -116,6 +122,8 @@ def process_article(url, website):
|
||||
return
|
||||
|
||||
imgs = content_tag.find_all("img")
|
||||
# 查找视频元素
|
||||
videos = content_tag.find_all("video")
|
||||
media_files = []
|
||||
|
||||
safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
|
||||
@@ -134,8 +142,36 @@ def process_article(url, website):
|
||||
img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
|
||||
media_files.append(rel_path.replace("\\", "/"))
|
||||
|
||||
# 处理视频文件
|
||||
for video in videos:
|
||||
src = video.get("src")
|
||||
if not src:
|
||||
# 检查<source>标签
|
||||
source = video.find("source")
|
||||
if source:
|
||||
src = source.get("src")
|
||||
|
||||
if not src:
|
||||
continue
|
||||
|
||||
if not src.startswith("http"):
|
||||
src = urljoin(url, src)
|
||||
local_path = download_media(src, save_dir)
|
||||
if local_path:
|
||||
rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
|
||||
# 更新视频src属性
|
||||
if video.get("src"):
|
||||
video["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
|
||||
else:
|
||||
source = video.find("source")
|
||||
if source:
|
||||
source["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
|
||||
media_files.append(rel_path.replace("\\", "/"))
|
||||
|
||||
content_html = str(content_tag)
|
||||
|
||||
try:
|
||||
# 使用try-except处理可能的数据库约束错误
|
||||
article = Article.objects.create(
|
||||
website=website,
|
||||
title=title,
|
||||
@@ -145,6 +181,12 @@ def process_article(url, website):
|
||||
media_files=media_files
|
||||
)
|
||||
print(f"已保存文章及图片:{title}")
|
||||
except Exception as e:
|
||||
# 处理重复URL或其他数据库错误
|
||||
if "UNIQUE constraint failed" in str(e) and "core_article.url" in str(e):
|
||||
print(f"文章URL重复,跳过保存: {url}")
|
||||
else:
|
||||
print(f"保存文章时出错: {url},错误:{e}")
|
||||
|
||||
|
||||
def is_valid_url(url, base_netloc):
|
||||
|
||||
core/views.py
@@ -1,6 +1,21 @@
|
||||
import uuid
|
||||
from django.shortcuts import render
|
||||
from django.core.paginator import Paginator
|
||||
from django.http import JsonResponse
|
||||
from django.views.decorators.http import require_http_methods
|
||||
from django.core.management import call_command
|
||||
from .models import Article, Website
|
||||
import threading
|
||||
from django.http import HttpResponse
|
||||
import json
|
||||
import csv
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
from django.utils import timezone
|
||||
|
||||
|
||||
# 用于跟踪爬虫任务状态的全局字典
|
||||
crawler_tasks = {}
|
||||
|
||||
|
||||
def article_list(request):
|
||||
# 获取所有启用的网站
|
||||
@@ -18,7 +33,7 @@ def article_list(request):
|
||||
except Website.DoesNotExist:
|
||||
pass
|
||||
|
||||
# 新增:处理关键词搜索
|
||||
# 处理关键词搜索
|
||||
search_query = request.GET.get('q')
|
||||
if search_query:
|
||||
articles = articles.filter(title__icontains=search_query)
|
||||
@@ -35,10 +50,331 @@ def article_list(request):
|
||||
'page_obj': page_obj,
|
||||
'websites': websites,
|
||||
'selected_website': selected_website,
|
||||
# 新增:传递搜索关键词到模板
|
||||
'search_query': search_query
|
||||
})
|
||||
|
||||
|
||||
def article_detail(request, article_id):
|
||||
article = Article.objects.get(id=article_id)
|
||||
return render(request, 'core/article_detail.html', {'article': article})
|
||||
|
||||
|
||||
# 添加任务ID生成和状态跟踪
|
||||
@require_http_methods(["POST"])
|
||||
def run_crawler(request):
|
||||
"""
|
||||
从前台触发爬虫任务
|
||||
"""
|
||||
try:
|
||||
# 获取要执行的爬虫名称
|
||||
crawler_name = request.POST.get('crawler_name', '')
|
||||
if not crawler_name:
|
||||
return JsonResponse({'status': 'error', 'message': '爬虫名称不能为空'})
|
||||
|
||||
# 生成任务ID
|
||||
task_id = str(uuid.uuid4())
|
||||
|
||||
# 记录任务开始前的文章数量
|
||||
initial_count = Article.objects.count()
|
||||
|
||||
# 在后台线程中运行爬虫任务
|
||||
def run_spider():
|
||||
try:
|
||||
# 更新任务状态为运行中
|
||||
crawler_tasks[task_id] = {
|
||||
'status': 'running',
|
||||
'message': '爬虫正在运行...',
|
||||
'start_time': timezone.now(),
|
||||
'initial_count': initial_count
|
||||
}
|
||||
|
||||
# 根据爬虫名称调用相应的命令
|
||||
if crawler_name in ['crawl_xinhua', 'crawl_dongfangyancao']:
|
||||
call_command(crawler_name)
|
||||
else:
|
||||
# 如果是通用爬虫命令,使用crawl_articles
|
||||
call_command('crawl_articles', crawler_name)
|
||||
|
||||
# 计算新增文章数量
|
||||
final_count = Article.objects.count()
|
||||
added_count = final_count - initial_count
|
||||
|
||||
# 更新任务状态为完成
|
||||
crawler_tasks[task_id] = {
|
||||
'status': 'completed',
|
||||
'message': f'爬虫已完成,新增 {added_count} 篇文章',
|
||||
'added_count': added_count,
|
||||
'end_time': timezone.now()
|
||||
}
|
||||
except Exception as e:
|
||||
# 修改:改进错误处理,提供更友好的错误信息
|
||||
error_msg = str(e)
|
||||
if "UNIQUE constraint failed" in error_msg and "core_article.url" in error_msg:
|
||||
error_msg = "检测到重复文章URL,已跳过重复项"
|
||||
else:
|
||||
print(f"爬虫执行出错: {e}")
|
||||
|
||||
# 计算实际新增文章数量(即使有错误也统计)
|
||||
final_count = Article.objects.count()
|
||||
added_count = final_count - initial_count
|
||||
|
||||
# 更新任务状态为完成(即使有部分错误)
|
||||
crawler_tasks[task_id] = {
|
||||
'status': 'completed',
|
||||
'message': f'爬虫已完成,新增 {added_count} 篇文章。{error_msg}',
|
||||
'added_count': added_count,
|
||||
'end_time': timezone.now(),
|
||||
'error': error_msg
|
||||
}
|
||||
|
||||
# 启动后台线程执行爬虫
|
||||
thread = threading.Thread(target=run_spider)
|
||||
thread.daemon = True
|
||||
thread.start()
|
||||
|
||||
return JsonResponse({'status': 'success', 'message': f'爬虫 {crawler_name} 已启动', 'task_id': task_id})
|
||||
except Exception as e:
|
||||
return JsonResponse({'status': 'error', 'message': str(e)})
|
||||
|
||||
|
||||
# 检查爬虫状态的视图
|
||||
@require_http_methods(["POST"])
|
||||
def crawler_status(request):
|
||||
"""
|
||||
检查爬虫任务状态
|
||||
"""
|
||||
try:
|
||||
task_id = request.POST.get('task_id', '')
|
||||
if not task_id:
|
||||
return JsonResponse({'status': 'error', 'message': '任务ID不能为空'})
|
||||
|
||||
# 获取任务状态
|
||||
task_info = crawler_tasks.get(task_id)
|
||||
if not task_info:
|
||||
return JsonResponse({'status': 'error', 'message': '未找到任务'})
|
||||
|
||||
return JsonResponse(task_info)
|
||||
except Exception as e:
|
||||
return JsonResponse({'status': 'error', 'message': str(e)})
|
||||
|
||||
|
||||
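For reference, a hedged sketch of the JSON payload shape the template's polling code reads from this endpoint (field names taken from the task dict above; datetimes are serialized by JsonResponse's default DjangoJSONEncoder):

example_running_payload = {
    'status': 'running',
    'message': '爬虫正在运行...',
    'start_time': '2025-01-01T12:00:00Z',  # ISO 8601 string after serialization
    'initial_count': 42,                   # illustrative value only
}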
# 新增:文章导出视图
|
||||
@csrf_exempt
|
||||
@require_http_methods(["POST"])
|
||||
def export_articles(request):
|
||||
try:
|
||||
# 解析请求数据
|
||||
data = json.loads(request.body)
|
||||
article_ids = data.get('article_ids', [])
|
||||
format_type = data.get('format', 'json')
|
||||
|
||||
# 获取选中的文章
|
||||
articles = Article.objects.filter(id__in=article_ids)
|
||||
|
||||
if not articles.exists():
|
||||
return HttpResponse('没有选中文章', status=400)
|
||||
|
||||
# 根据格式类型导出
|
||||
if format_type == 'json':
|
||||
# 准备JSON数据
|
||||
articles_data = []
|
||||
for article in articles:
|
||||
articles_data.append({
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'website': article.website.name,
|
||||
'url': article.url,
|
||||
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
|
||||
'content': article.content,
|
||||
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'media_files': article.media_files
|
||||
})
|
||||
|
||||
# 创建JSON响应
|
||||
response = HttpResponse(
|
||||
json.dumps(articles_data, ensure_ascii=False, indent=2),
|
||||
content_type='application/json'
|
||||
)
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.json"'
|
||||
return response
|
||||
|
||||
elif format_type == 'csv':
|
||||
# 创建CSV响应
|
||||
response = HttpResponse(content_type='text/csv')
|
||||
response['Content-Disposition'] = 'attachment; filename="articles.csv"'
|
||||
|
||||
# 创建CSV写入器
|
||||
writer = csv.writer(response)
|
||||
writer.writerow(['ID', '标题', '网站', 'URL', '发布时间', '内容', '创建时间', '媒体文件'])
|
||||
|
||||
# 写入文章数据
|
||||
for article in articles:
|
||||
writer.writerow([
|
||||
article.id,
|
||||
article.title,
|
||||
article.website.name,
|
||||
article.url,
|
||||
article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else '',
|
||||
article.content,
|
||||
article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
';'.join(article.media_files) if article.media_files else ''
|
||||
])
|
||||
|
||||
return response
|
||||
|
||||
# 新增:支持ZIP格式导出
|
||||
elif format_type == 'zip':
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from django.conf import settings
|
||||
import os
|
||||
|
||||
# 创建内存中的ZIP文件
|
||||
zip_buffer = BytesIO()
|
||||
|
||||
with zipfile.ZipFile(zip_buffer, 'w') as zip_file:
|
||||
# 为每篇文章创建Word文档并添加到ZIP文件中
|
||||
for article in articles:
|
||||
# 为每篇文章创建单独的文件夹
|
||||
# Build a filesystem-safe folder name outside the f-string (backslashes and
# quote characters inside f-string expressions are a syntax error before Python 3.12)
safe_title = ''.join('_' if c in '\\/:*?"<>|' else c for c in article.title)
article_folder = f"article_{article.id}_{safe_title}"
|
||||
|
||||
# 创建文章数据
|
||||
article_data = {
|
||||
'id': article.id,
|
||||
'title': article.title,
|
||||
'website': article.website.name,
|
||||
'url': article.url,
|
||||
'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
|
||||
'content': article.content,
|
||||
'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'media_files': article.media_files
|
||||
}
|
||||
|
||||
# 将文章数据保存为Word文件并添加到ZIP
|
||||
try:
|
||||
from docx import Document
|
||||
from docx.shared import Inches
|
||||
from io import BytesIO
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
|
||||
# 创建Word文档
|
||||
doc = Document()
|
||||
doc.add_heading(article.title, 0)
|
||||
|
||||
# 添加文章元数据
|
||||
doc.add_paragraph(f"网站: {article.website.name}")
|
||||
doc.add_paragraph(f"URL: {article.url}")
|
||||
doc.add_paragraph(f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
|
||||
doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
# 添加文章内容
|
||||
doc.add_heading('内容', level=1)
|
||||
|
||||
# 处理HTML内容
|
||||
soup = BeautifulSoup(article.content, 'html.parser')
|
||||
|
||||
# 处理内容中的图片
|
||||
for img in soup.find_all('img'):
|
||||
src = img.get('src', '')
|
||||
if src:
|
||||
try:
|
||||
# 构建完整的图片路径
|
||||
if src.startswith('http'):
|
||||
# 网络图片
|
||||
response = requests.get(src, timeout=10)
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
# 本地图片
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
|
||||
if os.path.exists(full_path):
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
except Exception as e:
|
||||
# 如果添加图片失败,添加图片URL作为文本
|
||||
doc.add_paragraph(f"[图片: {src}]")
|
||||
|
||||
# 移除原始img标签
|
||||
img.decompose()
|
||||
|
||||
content_text = soup.get_text()
|
||||
doc.add_paragraph(content_text)
|
||||
|
||||
# 添加媒体文件信息
|
||||
if article.media_files:
|
||||
doc.add_heading('媒体文件', level=1)
|
||||
for media_file in article.media_files:
|
||||
try:
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 检查文件扩展名以确定处理方式
|
||||
file_extension = os.path.splitext(media_file)[1].lower()
|
||||
|
||||
# 图片文件处理
|
||||
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
|
||||
doc.add_picture(full_path, width=Inches(4.0))
|
||||
# 视频文件处理
|
||||
elif file_extension in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']:
|
||||
doc.add_paragraph(f"[视频文件: {media_file}]")
|
||||
# 其他文件类型
|
||||
else:
|
||||
doc.add_paragraph(f"[文件: {media_file}]")
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
response = requests.get(media_file, timeout=10)
|
||||
file_extension = os.path.splitext(media_file)[1].lower()
|
||||
|
||||
# 图片文件处理
|
||||
if file_extension in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff']:
|
||||
image_stream = BytesIO(response.content)
|
||||
doc.add_picture(image_stream, width=Inches(4.0))
|
||||
else:
|
||||
doc.add_paragraph(f"[文件: {media_file}]")
|
||||
else:
|
||||
doc.add_paragraph(media_file)
|
||||
except Exception as e:
|
||||
doc.add_paragraph(media_file)
|
||||
|
||||
# 保存Word文档到内存
|
||||
doc_buffer = BytesIO()
|
||||
doc.save(doc_buffer)
|
||||
doc_buffer.seek(0)
|
||||
|
||||
# 将Word文档添加到ZIP包
|
||||
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.docx'), doc_buffer.read())
|
||||
|
||||
except ImportError:
|
||||
# 如果没有安装python-docx库,回退到JSON格式
|
||||
json_data = json.dumps(article_data, ensure_ascii=False, indent=2)
|
||||
zip_file.writestr(os.path.join(article_folder, f'{article.title.replace("/", "_")}.json'), json_data)
|
||||
|
||||
# 添加媒体文件到ZIP包
|
||||
if article.media_files:
|
||||
for media_file in article.media_files:
|
||||
try:
|
||||
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
|
||||
if os.path.exists(full_path):
|
||||
# 添加文件到ZIP包
|
||||
zip_file.write(full_path, os.path.join(article_folder, 'media', media_file))
|
||||
else:
|
||||
# 如果是URL格式的媒体文件
|
||||
if media_file.startswith('http'):
|
||||
import requests
|
||||
response = requests.get(media_file, timeout=10)
|
||||
zip_file.writestr(os.path.join(article_folder, 'media', os.path.basename(media_file)), response.content)
|
||||
except Exception as e:
|
||||
# 如果添加媒体文件失败,继续处理其他文件
|
||||
pass
|
||||
|
||||
# 创建HttpResponse
|
||||
zip_buffer.seek(0)
|
||||
response = HttpResponse(zip_buffer.getvalue(), content_type='application/zip')
|
||||
response['Content-Disposition'] = 'attachment; filename=articles_export.zip'
|
||||
return response
|
||||
|
||||
else:
|
||||
return HttpResponse('不支持的格式', status=400)
|
||||
|
||||
except Exception as e:
|
||||
return HttpResponse(f'导出失败: {str(e)}', status=500)
|
||||
@@ -4,12 +4,10 @@ from django.contrib import admin
from django.urls import path, include

# Import the custom admin site instances
from core.admin import news_cn_admin, dongfangyancao_admin


urlpatterns = [
    path('admin/', admin.site.urls),
    path('news_cn_admin/', news_cn_admin.urls),
    path('dongfangyancao_admin/', dongfangyancao_admin.urls),
    # Front-end access goes through the core app's urls
    path('', include('core.urls')),
]