This commit is contained in:
2025-08-13 21:35:11 +08:00
parent c618528a0a
commit 31d0525cd0
10 changed files with 243 additions and 897 deletions

View File

@@ -1,9 +1,6 @@
from django.contrib import admin
from django.contrib.admin import AdminSite
from .models import Website, Article
# 添加actions相关的导入
from django.contrib import messages
from django.http import HttpResponseRedirect
# 添加导出功能所需导入
import csv
from django.http import HttpResponse
@@ -12,17 +9,7 @@ import json
from django.shortcuts import render, redirect
from django.urls import path
from django.contrib import admin
from django.http import JsonResponse
from django.views.decorators.http import require_http_methods
from django.core.management import call_command
import threading
import uuid
from django.utils import timezone
# 创建自定义管理站点
# 实例化管理站点
# 添加运行爬虫的视图函数
def run_crawler_view(request):
@@ -36,21 +23,24 @@ def run_crawler_view(request):
return redirect('admin:core_article_changelist')
try:
# 根据网站名称确定要执行的爬虫命令
if website_name == 'crawl_xinhua':
crawler_name = 'crawl_xinhua'
elif website_name == 'crawl_dongfangyancao':
crawler_name = 'crawl_dongfangyancao'
elif website_name == 'crawl_articles':
crawler_name = 'crawl_articles'
else:
# 对于其他网站,使用通用爬虫命令
crawler_name = 'crawl_articles'
# 动态获取网站对象
website = Website.objects.get(name=website_name)
# 运行爬虫命令不传递website_name作为参数
call_command(crawler_name)
# 根据网站对象确定要执行的爬虫命令
# 移除默认的通用爬虫,每个网站必须配置自己的爬虫命令
crawler_name = getattr(website, 'crawler_command', None)
messages.success(request, f'成功执行爬虫: {crawler_name}')
# 如果网站没有配置爬虫命令,则报错
if not crawler_name:
messages.error(request, f'网站 {website_name} 未配置爬虫命令')
return redirect('admin:core_article_changelist')
# 运行爬虫命令,传递网站名称
call_command(crawler_name, website_name)
messages.success(request, f'成功执行爬虫: {website_name}')
except Website.DoesNotExist:
messages.error(request, f'网站不存在: {website_name}')
except Exception as e:
messages.error(request, f'执行爬虫失败: {str(e)}')
@@ -71,6 +61,10 @@ class ArticleAdmin(admin.ModelAdmin):
actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json',
'export_as_word', 'export_with_media']
def get_websites(self):
    """Return the queryset of ``Website`` rows whose ``enabled`` flag is true."""
    enabled_sites = Website.objects.filter(enabled=True)
    return enabled_sites
# 重写get_urls方法添加自定义URL
def get_urls(self):
urls = super().get_urls()

View File

@@ -8,7 +8,7 @@ class Command(BaseCommand):
def handle(self, *args, **kwargs):
website, created = Website.objects.get_or_create(
name="www.gov.cn",
name="中国政府网",
defaults={
'article_list_url': 'https://www.gov.cn/',
'article_selector': 'a'

View File

@@ -0,0 +1,26 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
    """Management command that crawls the whole People's Daily app site.

    It makes sure a ``Website`` row named "人民日报" exists (creating it with
    default selectors if needed), fills in ``base_url`` on an existing row
    that lacks one, and then hands the start URL to ``full_site_crawler``.
    """

    help = "全站递归爬取 人民日报 https://www.peopleapp.com"

    # Single source of truth for the URLs used below.
    BASE_URL = "https://www.peopleapp.com"
    START_URL = BASE_URL + "/home"

    def add_arguments(self, parser):
        """Declare an optional positional ``website_name`` argument.

        The admin crawler view invokes commands as
        ``call_command(crawler_name, website_name)``. Without a declared
        positional argument Django's parser rejects that call with an
        "unrecognized arguments" error, so the value is accepted here. It is
        optional so that running the command with no arguments keeps working.
        """
        parser.add_argument(
            "website_name",
            nargs="?",
            default=None,
            help="Optional website name passed by the admin view (unused).",
        )

    def handle(self, *args, **kwargs):
        """Ensure the website record exists, then run the full-site crawl.

        Progress messages are written to ``self.stdout``. The crawl is capped
        at 500 pages via ``max_pages``.
        """
        website, created = Website.objects.get_or_create(
            name="人民日报",
            defaults={
                'article_list_url': self.START_URL,
                'article_selector': 'a',
                'base_url': self.BASE_URL,
            },
        )

        # ``defaults`` only apply on creation, so an already-existing row
        # with an empty base_url has to be updated explicitly.
        if not created and not website.base_url:
            website.base_url = self.BASE_URL
            website.save()

        self.stdout.write(f"开始全站爬取: {self.START_URL}")
        full_site_crawler(self.START_URL, website, max_pages=500)
        self.stdout.write("爬取完成")

View File

@@ -8,7 +8,7 @@ class Command(BaseCommand):
def handle(self, *args, **kwargs):
website, created = Website.objects.get_or_create(
name="www.news.cn",
name="新华网",
defaults={
'article_list_url': 'https://www.news.cn/',
'article_selector': 'a'

View File

@@ -4,7 +4,6 @@ import json
import csv
import os
from django.conf import settings
from django.core.files.storage import default_storage
import zipfile
from django.utils import timezone

View File

@@ -9,9 +9,9 @@
<label for="website-select">选择网站:</label>
<select name="website_name" id="website-select" required>
<option value="">-- 请选择网站 --</option>
<option value="crawl_xinhua">新华网</option>
<option value="crawl_dongfangyancao">东方烟草报</option>
<option value="crawl_articles">通用爬虫</option>
{% for website in cl.model_admin.get_websites %}
<option value="{{ website.name }}">{{ website.name }}</option>
{% endfor %}
</select>
<input type="submit" value="执行爬虫" class="default" style="margin-left: 10px;"/>
</form>

View File

@@ -2,7 +2,8 @@
<html lang="zh">
<head>
<meta charset="UTF-8"/>
<title>绿色课堂文章列表</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<title>{{ article.title }} - 绿色课堂</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
@@ -10,15 +11,16 @@
color: #333;
margin: 0 auto;
padding: 20px;
background-color: #f0f8ff; /* 统一背景色调 */
background-color: #f0f8ff;
max-width: 800px;
}
.container {
background: white;
padding: 30px;
margin-bottom: 20px;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* 添加轻微阴影 */
border-radius: 8px; /* 添加圆角 */
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
border-radius: 8px;
}
h1 {
@@ -28,765 +30,59 @@
margin-top: 0;
}
.filters {
.meta {
color: #78909c;
font-size: 0.9em;
margin-bottom: 20px;
padding: 15px;
background-color: #e3f2fd; /* 统一滤镜背景色调 */
border-radius: 5px;
}
.filters a {
.content {
margin-top: 20px;
}
.content img {
max-width: 100%;
height: auto;
}
.back-link {
display: inline-block;
padding: 5px 10px;
margin: 0 5px 5px 0;
background-color: #bbdefb; /* 统一链接背景色调 */
margin-bottom: 20px;
color: #1976d2;
text-decoration: none;
}
.back-link:hover {
color: #0d47a1;
text-decoration: none;
border-radius: 3px;
}
.filters a.active {
background-color: #3498db;
color: white;
}
ul {
list-style: none;
padding: 0;
}
li {
padding: 10px 0;
border-bottom: 1px solid #e0e0e0; /* 统一分隔线颜色 */
}
li:last-child {
border-bottom: none;
}
a {
color: #1976d2; /* 统一链接颜色 */
text-decoration: none;
}
a:hover {
color: #0d47a1; /* 统一悬停颜色 */
text-decoration: underline;
}
.meta {
color: #78909c; /* 统一元数据颜色 */
font-size: 0.9em;
}
.pagination {
margin-top: 30px;
text-align: center;
padding: 20px 0;
}
.pagination a {
display: inline-block;
padding: 8px 16px;
background-color: #3498db;
color: white;
text-decoration: none;
border-radius: 4px;
margin: 0 2px; /* 修改:调整页码间距 */
}
.pagination a:hover {
background-color: #2980b9;
}
.pagination span {
margin: 0 10px;
color: #7f8c8d;
}
/* 新增:当前页码样式 */
.pagination .current {
background-color: #2980b9;
cursor: default;
}
/* 新增:省略号样式 */
.pagination .ellipsis {
display: inline-block;
padding: 8px 4px;
color: #7f8c8d;
}
/* 新增:搜索框样式 */
.search-form {
margin-bottom: 20px;
padding: 15px;
background-color: #e3f2fd; /* 统一搜索框背景色调 */
border-radius: 5px;
}
.search-form input[type="text"] {
padding: 8px 12px;
border: 1px solid #bbdefb; /* 统一边框颜色 */
border-radius: 4px;
width: 300px;
margin-right: 10px;
background-color: #fff;
}
.search-form input[type="submit"] {
padding: 8px 16px;
background-color: #3498db;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
}
.search-form input[type="submit"]:hover {
background-color: #2980b9;
}
.search-info {
color: #78909c; /* 统一搜索信息颜色 */
font-size: 0.9em;
margin-bottom: 10px;
}
/* 新增:左侧筛选栏样式 */
.content-wrapper {
display: flex;
gap: 20px;
}
.sidebar {
flex: 0 0 200px;
background-color: #e3f2fd; /* 统一边栏背景色调 */
border-radius: 5px;
padding: 15px;
}
.main-content {
flex: 1;
}
.sidebar .filters {
margin-bottom: 20px;
padding: 0;
background-color: transparent;
}
.sidebar .filters strong {
display: block;
margin-bottom: 10px;
color: #2c3e50;
}
.sidebar .filters a {
display: block;
padding: 8px 10px;
margin: 0 0 5px 0;
background-color: #bbdefb; /* 统一边栏链接背景色调 */
color: #0d47a1;
text-decoration: none;
border-radius: 3px;
}
.sidebar .filters a.active {
background-color: #3498db;
color: white;
}
/* 新增:导出功能样式 */
.export-section {
margin-bottom: 20px;
padding: 15px;
background-color: #e8f5e9; /* 统一导出区域背景色调 */
border-radius: 5px;
text-align: center;
}
.export-btn {
padding: 10px 20px;
background-color: #4caf50; /* 统一按钮背景色调 */
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
margin: 0 5px;
}
.export-btn:hover {
background-color: #388e3c; /* 统一按钮悬停色调 */
}
.export-btn:disabled {
background-color: #9e9e9e; /* 统一禁用按钮色调 */
cursor: not-allowed;
}
.article-checkbox {
margin-right: 10px;
}
/* 新增:爬虫控制按钮样式 */
.crawler-control {
margin-bottom: 20px;
padding: 15px;
background-color: #fff3e0; /* 统一爬虫控制区域背景色调 */
border-radius: 5px;
text-align: center;
}
.crawler-btn {
padding: 10px 20px;
background-color: #ff9800; /* 统一爬虫按钮背景色调 */
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
margin: 0 5px;
}
.crawler-btn:hover {
background-color: #f57c00; /* 统一爬虫按钮悬停色调 */
}
.crawler-btn:disabled {
background-color: #9e9e9e; /* 统一禁用爬虫按钮色调 */
cursor: not-allowed;
}
.crawler-result {
margin-top: 10px;
@media (max-width: 600px) {
body {
padding: 10px;
border-radius: 4px;
display: none;
}
.crawler-result.success {
background-color: #e8f5e9;
color: #2e7d32;
.container {
padding: 15px;
}
.crawler-result.error {
background-color: #ffebee;
color: #c62828;
}
/* 新增:进度条样式 */
.progress-container {
margin-top: 10px;
display: none;
}
.progress-bar {
width: 100%;
height: 20px;
background-color: #e0e0e0;
border-radius: 10px;
overflow: hidden;
}
.progress-fill {
height: 100%;
background-color: #4caf50;
width: 0%;
transition: width 0.3s ease;
}
.progress-text {
margin-top: 5px;
font-size: 14px;
color: #666;
}
</style>
</head>
<body>
<div class="container">
<h1>绿色课堂文章列表</h1>
<div class="container">
<a href="{% url 'article_list' %}" class="back-link">&laquo; 返回文章列表</a>
<!-- 新增:搜索表单 -->
<div class="search-form">
<form method="get">
<input type="text" name="q" placeholder="输入关键词搜索文章..." value="{{ search_query }}">
{% if selected_website %}
<input type="hidden" name="website" value="{{ selected_website.id }}">
{% endif %}
<input type="submit" value="搜索">
</form>
<h1>{{ article.title }}</h1>
<div class="meta">
网站: {{ article.website.name }} |
发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
创建时间: {{ article.created_at|date:"Y-m-d H:i" }}
</div>
<div class="content-wrapper">
<!-- 左侧筛选栏 -->
<div class="sidebar">
<div class="filters">
<strong>按网站筛选:</strong>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}"
{% if not selected_website %}class="active" {% endif %}>全部</a>
{% for website in websites %}
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}"
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
{% endfor %}
<div class="content">
{{ article.content|safe }}
</div>
</div>
<!-- 主内容区域 -->
<div class="main-content">
<!-- 新增:搜索结果信息 -->
{% if search_query %}
<div class="search-info">
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
</div>
{% endif %}
<!-- 新增:导出功能 -->
<div class="export-section">
<button id="selectAllBtn" class="export-btn">全选</button>
<button id="deselectAllBtn" class="export-btn">取消全选</button>
<button id="exportJsonBtn" class="export-btn" disabled>导出为JSON</button>
<button id="exportCsvBtn" class="export-btn" disabled>导出为CSV</button>
<!-- 新增:导出为ZIP包按钮 -->
<button id="exportZipBtn" class="export-btn" disabled>导出为ZIP包</button>
</div>
<ul>
{% for article in page_obj %}
<li>
<input type="checkbox" class="article-checkbox" value="{{ article.id }}"
id="article_{{ article.id }}">
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
</li>
{% empty %}
<li>暂无文章</li>
{% endfor %}
</ul>
<div class="pagination">
{% if page_obj.has_previous %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo;
首页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
{% endif %}
{% endif %}
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
<!-- 修改:优化页码显示逻辑 -->
{% with page_obj.paginator as paginator %}
{% for num in paginator.page_range %}
{% if page_obj.number == num %}
<a href="#" class="current">{{ num }}</a>
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == 1 or num == paginator.num_pages %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
<span class="ellipsis">...</span>
{% endif %}
{% endfor %}
{% endwith %}
{% if page_obj.has_next %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
{% endif %}
{% endif %}
</div>
</div>
</div>
</div>
<script>
// 存储当前任务ID和检查状态的定时器
let currentTaskId = null;
let statusCheckInterval = null;
// 获取页面元素
const runCrawlerBtn = document.getElementById('runCrawlerBtn');
const runDongfangyancaoCrawlerBtn = document.getElementById('runDongfangyancaoCrawlerBtn');
const pauseCrawlerBtn = document.getElementById('pauseCrawlerBtn');
const progressContainer = document.getElementById('crawlerProgress');
const progressFill = document.getElementById('progressFill');
const progressText = document.getElementById('progressText');
const resultDiv = document.getElementById('crawlerResult');
// 绑定爬虫按钮事件
runCrawlerBtn.addEventListener('click', function () {
runCrawler('www.news.cn', 'crawl_xinhua');
});
runDongfangyancaoCrawlerBtn.addEventListener('click', function () {
runCrawler('东方烟草报', 'crawl_dongfangyancao');
});
// 暂停按钮事件
pauseCrawlerBtn.addEventListener('click', function () {
if (currentTaskId) {
pauseCrawler(currentTaskId);
}
});
// 运行爬虫函数
/**
 * Start a crawler on the server and begin polling its status.
 *
 * Disables both run buttons, shows the progress area, then POSTs the crawler
 * name to the run_crawler endpoint. When the reply has status "success" the
 * returned task id is stored in currentTaskId, the pause button is enabled
 * and status polling starts. Any other reply, or a failed request, shows an
 * error message and resets the controls.
 *
 * @param websiteName Display name of the target site. It is not sent to the
 *                    server; the parameter is kept so existing callers work.
 * @param crawlerName Crawler command name, sent as the crawler_name field.
 */
function runCrawler(websiteName, crawlerName) {
    // Show an error message and return the controls to their idle state.
    const failWith = message => {
        resultDiv.style.display = 'block';
        resultDiv.className = 'crawler-result error';
        resultDiv.textContent = message;
        runCrawlerBtn.disabled = false;
        runDongfangyancaoCrawlerBtn.disabled = false;
        progressContainer.style.display = 'none';
    };

    // Disable the buttons so the crawler cannot be started twice.
    runCrawlerBtn.disabled = true;
    runDongfangyancaoCrawlerBtn.disabled = true;
    resultDiv.style.display = 'none';

    // Reveal the progress area with an initial "starting" message.
    progressContainer.style.display = 'block';
    updateProgress(0, '爬虫启动中...');

    fetch('{% url "run_crawler" %}', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/x-www-form-urlencoded',
            'X-CSRFToken': '{{ csrf_token }}'
        },
        // Percent-encode the value: characters such as '&', '=' or '+'
        // would otherwise corrupt the form-encoded body.
        body: 'crawler_name=' + encodeURIComponent(crawlerName)
    })
        .then(response => response.json())
        .then(data => {
            if (data.status === 'success') {
                currentTaskId = data.task_id;
                pauseCrawlerBtn.disabled = false;
                // Begin polling the server for this task's status.
                checkCrawlerStatus(currentTaskId);
            } else {
                failWith(data.message);
            }
        })
        .catch(error => failWith('请求失败: ' + error));
}
// 检查爬虫状态的函数
/**
 * Poll the crawler_status endpoint once per second for the given task and
 * update the page according to the reported status.
 *
 * The interval handle is kept in statusCheckInterval, so a new call replaces
 * any poll that is still running. Polling stops when the status is
 * "completed", "paused" or "error", or when a request or its JSON parsing
 * fails.
 *
 * @param taskId Task identifier, sent to the server as the task_id field.
 */
function checkCrawlerStatus(taskId) {
// Cancel any polling timer left over from a previous call.
if (statusCheckInterval) {
clearInterval(statusCheckInterval);
}
// Start a new timer that queries the status every 1000 ms.
statusCheckInterval = setInterval(() => {
fetch('{% url "crawler_status" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
'X-CSRFToken': '{{ csrf_token }}'
},
body: 'task_id=' + taskId
})
.then(response => response.json())
.then(data => {
if (data.status === 'running') {
// Progress is simulated, not reported by the server: one percent per
// elapsed second since data.start_time, capped at 90.
const elapsedTime = new Date() - new Date(data.start_time);
const progress = Math.min(90, Math.floor(elapsedTime / 1000));
updateProgress(progress, data.message);
} else if (data.status === 'completed') {
// Crawl finished: stop polling and show the final message at 100%.
clearInterval(statusCheckInterval);
updateProgress(100, data.message);
// Return the buttons to their idle state.
runCrawlerBtn.disabled = false;
runDongfangyancaoCrawlerBtn.disabled = false;
pauseCrawlerBtn.disabled = true;
// Show the result message.
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result success';
resultDiv.textContent = data.message;
// Hide the progress bar after 3 seconds.
setTimeout(() => {
progressContainer.style.display = 'none';
}, 3000);
// Reload the page after 2 seconds so new articles are listed.
// NOTE(review): this fires before the 3-second hide timer above.
setTimeout(() => {
location.reload();
}, 2000);
} else if (data.status === 'paused') {
// Crawl paused: stop polling and keep the last reported progress.
clearInterval(statusCheckInterval);
updateProgress(data.progress || 0, '爬虫已暂停');
// Return the buttons to their idle state.
runCrawlerBtn.disabled = false;
runDongfangyancaoCrawlerBtn.disabled = false;
pauseCrawlerBtn.disabled = true;
// Show the result message.
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result success';
resultDiv.textContent = '爬虫已暂停';
} else if (data.status === 'error') {
// Crawl failed: stop polling, hide the progress bar, show the error.
clearInterval(statusCheckInterval);
progressContainer.style.display = 'none';
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result error';
resultDiv.textContent = data.message;
// Return the buttons to their idle state.
runCrawlerBtn.disabled = false;
runDongfangyancaoCrawlerBtn.disabled = false;
pauseCrawlerBtn.disabled = true;
}
})
.catch(error => {
// The request or its JSON parsing failed: stop polling and report it.
clearInterval(statusCheckInterval);
progressContainer.style.display = 'none';
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result error';
resultDiv.textContent = '检查状态失败: ' + error;
// Return the buttons to their idle state.
runCrawlerBtn.disabled = false;
runDongfangyancaoCrawlerBtn.disabled = false;
pauseCrawlerBtn.disabled = true;
});
}, 1000);
}
// 更新进度条函数
/**
 * Set the progress bar fill width and the caption shown with it.
 *
 * @param percent Fill width, applied as a CSS percentage.
 * @param text    Caption written into the progress text element.
 */
function updateProgress(percent, text) {
    const fillWidth = percent + '%';
    progressText.textContent = text;
    progressFill.style.width = fillWidth;
}
// 暂停爬虫函数
/**
 * Ask the server to pause a running crawler task.
 *
 * POSTs the task id to the pause_crawler endpoint. When the reply has status
 * "success" the pause button is disabled and the progress caption switches
 * to a "pausing" message. Any other reply, or a failed request, shows an
 * error message.
 *
 * @param taskId Task identifier, sent to the server as the task_id field.
 */
function pauseCrawler(taskId) {
    // Display an error message in the result area.
    const showError = message => {
        resultDiv.style.display = 'block';
        resultDiv.className = 'crawler-result error';
        resultDiv.textContent = message;
    };

    fetch('{% url "pause_crawler" %}', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/x-www-form-urlencoded',
            'X-CSRFToken': '{{ csrf_token }}'
        },
        // Percent-encode the value so it cannot break the form-encoded body.
        body: 'task_id=' + encodeURIComponent(taskId)
    })
        .then(response => response.json())
        .then(data => {
            if (data.status === 'success') {
                // Pause accepted: lock the pause button and update the caption.
                pauseCrawlerBtn.disabled = true;
                updateProgress(data.progress || 0, '正在暂停...');
            } else {
                showError(data.message);
            }
        })
        .catch(error => showError('暂停请求失败: ' + error));
}
// 导出功能相关JavaScript
const checkboxes = document.querySelectorAll('.article-checkbox');
const exportJsonBtn = document.getElementById('exportJsonBtn');
const exportCsvBtn = document.getElementById('exportCsvBtn');
const selectAllBtn = document.getElementById('selectAllBtn');
const deselectAllBtn = document.getElementById('deselectAllBtn');
// 新增:获取ZIP导出按钮元素
const exportZipBtn = document.getElementById('exportZipBtn');
// 更新导出按钮状态
/**
 * Enable the three export buttons (JSON, CSV, ZIP) only while at least one
 * article checkbox is ticked; disable them otherwise.
 */
function updateExportButtons() {
    const nothingSelected =
        document.querySelector('.article-checkbox:checked') === null;
    [exportJsonBtn, exportCsvBtn, exportZipBtn].forEach(button => {
        button.disabled = nothingSelected;
    });
}
// 为所有复选框添加事件监听器
checkboxes.forEach(checkbox => {
checkbox.addEventListener('change', updateExportButtons);
});
// 全选功能
selectAllBtn.addEventListener('click', () => {
checkboxes.forEach(checkbox => {
checkbox.checked = true;
});
updateExportButtons();
});
// 取消全选功能
deselectAllBtn.addEventListener('click', () => {
checkboxes.forEach(checkbox => {
checkbox.checked = false;
});
updateExportButtons();
});
// 导出为JSON功能
exportJsonBtn.addEventListener('click', () => {
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
.map(checkbox => checkbox.value);
// 发送POST请求导出文章
fetch('{% url "export_articles" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-CSRFToken': '{{ csrf_token }}'
},
body: JSON.stringify({
article_ids: selectedArticles,
format: 'json'
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.json';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 导出为CSV功能
exportCsvBtn.addEventListener('click', () => {
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
.map(checkbox => checkbox.value);
// 发送POST请求导出文章
fetch('{% url "export_articles" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-CSRFToken': '{{ csrf_token }}'
},
body: JSON.stringify({
article_ids: selectedArticles,
format: 'csv'
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.csv';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 新增:导出为ZIP包功能
exportZipBtn.addEventListener('click', () => {
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
.map(checkbox => checkbox.value);
// 发送POST请求导出文章为ZIP包
fetch('{% url "export_articles" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-CSRFToken': '{{ csrf_token }}'
},
body: JSON.stringify({
article_ids: selectedArticles,
format: 'zip' // 指定导出格式为ZIP
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.zip';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 初始化导出按钮状态
updateExportButtons();
</script>
</body>
</html>

View File

@@ -17,7 +17,7 @@
background: white;
padding: 30px;
margin-bottom: 20px;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* 添加轻微阴影 */
box-shadow: 0 2px 5px rgba(0,0,0,0.05); /* 添加轻微阴影 */
border-radius: 8px; /* 添加圆角 */
}
@@ -251,11 +251,9 @@
<div class="sidebar">
<div class="filters">
<strong>按网站筛选:</strong>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}"
{% if not selected_website %}class="active" {% endif %}>全部</a>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" {% if not selected_website %}class="active" {% endif %}>全部</a>
{% for website in websites %}
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}"
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" {% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
{% endfor %}
</div>
</div>
@@ -283,8 +281,7 @@
<ul>
{% for article in page_obj %}
<li>
<input type="checkbox" class="article-checkbox" value="{{ article.id }}"
id="article_{{ article.id }}">
<input type="checkbox" class="article-checkbox" value="{{ article.id }}" id="article_{{ article.id }}">
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
</li>
@@ -296,11 +293,8 @@
<div class="pagination">
{% if page_obj.has_previous %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo;
首页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo; 首页</a>
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
@@ -316,15 +310,13 @@
<a href="#" class="current">{{ num }}</a>
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == 1 or num == paginator.num_pages %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
@@ -336,15 +328,11 @@
{% if page_obj.has_next %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
{% endif %}
{% endif %}
</div>

View File

@@ -107,6 +107,17 @@ def process_article(url, website):
soup.find("div", id="content") or
soup.find("div", class_="mainBody")
)
elif website.name == "人民日报":
# 人民日报网站的文章结构处理
title_tag = soup.find("h1") or soup.find("title")
# 查找主要内容区域
content_tag = (
soup.find("div", class_="content") or
soup.find("div", class_="article-content") or
soup.find("div", id="content") or
soup.find("div", class_="text") or
soup.find("section", class_="content")
)
else:
# 默认处理方式
title_tag = soup.find("h1") or soup.find("title")
@@ -256,6 +267,24 @@ def full_site_crawler(start_url, website, max_pages=1000):
("/xinwen/" in path) or
("/huoban/" in path)
)
elif website.name == "人民日报":
# 人民日报的文章页面判断逻辑
parsed_url = urlparse(url)
path = parsed_url.path
# 修改: 增加更准确的文章页面判断逻辑
is_article_page = (
(soup.find("div", class_="content") is not None and
soup.find("h1") is not None) or
soup.find("div", class_="article-content") is not None or
(soup.find("div", id="content") is not None and
soup.find("h1") is not None) or
soup.find("div", class_="text") is not None or
soup.find("section", class_="content") is not None or
("/article/" in path) or
(path.startswith("/detail/") and len(path) > 10) or
# 增加对peopleapp.com特定文章路径的判断
("/dynamic/" in path and "article" in path)
)
else:
# 默认判断逻辑
is_article_page = (
@@ -271,5 +300,18 @@ def full_site_crawler(start_url, website, max_pages=1000):
# 扩展队列,发现新链接
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
if href not in visited and is_valid_url(href, base_netloc):
# 对于人民日报网站,我们扩展链接发现逻辑
if website.name == "人民日报":
# 允许爬取以https://www.peopleapp.com/开头的链接
if href.startswith("https://www.peopleapp.com/") and href not in visited:
# 增加对文章链接的识别
parsed_href = urlparse(href)
href_path = parsed_href.path
# 添加更多可能的文章链接模式
if ("/article/" in href_path or
href_path.startswith("/detail/") or
("/dynamic/" in href_path and "article" in href_path) or
href_path.count("/") > 2): # 更深层的页面可能是文章页
queue.append(href)
elif href not in visited and is_valid_url(href, base_netloc):
queue.append(href)

View File

@@ -22,6 +22,7 @@ def article_list(request):
# 获取筛选网站
selected_website = None
# 修改:确保始终获取所有文章,除非有特定筛选
articles = Article.objects.all()
website_id = request.GET.get('website')