This commit is contained in:
2025-08-13 21:35:11 +08:00
parent c618528a0a
commit 31d0525cd0
10 changed files with 243 additions and 897 deletions

View File

@@ -1,9 +1,6 @@
from django.contrib import admin
from django.contrib.admin import AdminSite
from .models import Website, Article from .models import Website, Article
# 添加actions相关的导入 # 添加actions相关的导入
from django.contrib import messages from django.contrib import messages
from django.http import HttpResponseRedirect
# 添加导出功能所需导入 # 添加导出功能所需导入
import csv import csv
from django.http import HttpResponse from django.http import HttpResponse
@@ -12,17 +9,7 @@ import json
from django.shortcuts import render, redirect from django.shortcuts import render, redirect
from django.urls import path from django.urls import path
from django.contrib import admin from django.contrib import admin
from django.http import JsonResponse
from django.views.decorators.http import require_http_methods
from django.core.management import call_command from django.core.management import call_command
import threading
import uuid
from django.utils import timezone
# 创建自定义管理站点
# 实例化管理站点
# 添加运行爬虫的视图函数 # 添加运行爬虫的视图函数
def run_crawler_view(request): def run_crawler_view(request):
@@ -36,21 +23,24 @@ def run_crawler_view(request):
return redirect('admin:core_article_changelist') return redirect('admin:core_article_changelist')
try: try:
# 根据网站名称确定要执行的爬虫命令 # 动态获取网站对象
if website_name == 'crawl_xinhua': website = Website.objects.get(name=website_name)
crawler_name = 'crawl_xinhua'
elif website_name == 'crawl_dongfangyancao':
crawler_name = 'crawl_dongfangyancao'
elif website_name == 'crawl_articles':
crawler_name = 'crawl_articles'
else:
# 对于其他网站,使用通用爬虫命令
crawler_name = 'crawl_articles'
# 运行爬虫命令不传递website_name作为参数 # 根据网站对象确定要执行的爬虫命令
call_command(crawler_name) # 移除默认的通用爬虫,每个网站必须配置自己的爬虫命令
crawler_name = getattr(website, 'crawler_command', None)
messages.success(request, f'成功执行爬虫: {crawler_name}') # 如果网站没有配置爬虫命令,则报错
if not crawler_name:
messages.error(request, f'网站 {website_name} 未配置爬虫命令')
return redirect('admin:core_article_changelist')
# 运行爬虫命令,传递网站名称
call_command(crawler_name, website_name)
messages.success(request, f'成功执行爬虫: {website_name}')
except Website.DoesNotExist:
messages.error(request, f'网站不存在: {website_name}')
except Exception as e: except Exception as e:
messages.error(request, f'执行爬虫失败: {str(e)}') messages.error(request, f'执行爬虫失败: {str(e)}')
@@ -71,6 +61,10 @@ class ArticleAdmin(admin.ModelAdmin):
actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json', actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json',
'export_as_word', 'export_with_media'] 'export_as_word', 'export_with_media']
def get_websites(self):
    """Return the ``Website`` objects filtered by ``enabled=True``.

    The crawler form template iterates over this via
    ``cl.model_admin.get_websites`` to populate its website drop-down.
    """
    # NOTE(review): assumes the Website model defines an ``enabled`` field —
    # confirm against core.models.
    enabled_sites = Website.objects.filter(enabled=True)
    return enabled_sites
# 重写get_urls方法添加自定义URL # 重写get_urls方法添加自定义URL
def get_urls(self): def get_urls(self):
urls = super().get_urls() urls = super().get_urls()

View File

@@ -8,7 +8,7 @@ class Command(BaseCommand):
def handle(self, *args, **kwargs): def handle(self, *args, **kwargs):
website, created = Website.objects.get_or_create( website, created = Website.objects.get_or_create(
name="www.gov.cn", name="中国政府网",
defaults={ defaults={
'article_list_url': 'https://www.gov.cn/', 'article_list_url': 'https://www.gov.cn/',
'article_selector': 'a' 'article_selector': 'a'

View File

@@ -0,0 +1,26 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler
class Command(BaseCommand):
    """Management command that crawls the People's Daily app site.

    Ensures a ``Website`` row named "人民日报" exists (creating it with
    default selectors if needed), then hands the start URL to
    ``full_site_crawler``.
    """

    help = "全站递归爬取 人民日报 https://www.peopleapp.com"

    # Site root, stored on the Website row as ``base_url``.
    BASE_URL = "https://www.peopleapp.com"
    # Page the crawl starts from; also stored as ``article_list_url``.
    START_URL = "https://www.peopleapp.com/home"

    def add_arguments(self, parser):
        """Declare an optional positional ``website_name`` argument.

        The admin crawler view invokes crawler commands as
        ``call_command(crawler_name, website_name)``. Without a declared
        positional, Django's argument parser rejects that extra value with
        a ``CommandError``. The argument is optional so running the command
        with no arguments keeps working.
        """
        parser.add_argument(
            "website_name",
            nargs="?",
            default=None,
            help="Optional website name; accepted for compatibility and ignored.",
        )

    def handle(self, *args, **kwargs):
        """Create or fetch the Website row, then run the full-site crawl.

        Args:
            *args: Unused positional arguments supplied by Django.
            **kwargs: Parsed command options; ``website_name`` is accepted
                but not used, because this command always targets "人民日报".
        """
        website, created = Website.objects.get_or_create(
            name="人民日报",
            defaults={
                'article_list_url': self.START_URL,
                'article_selector': 'a',
                'base_url': self.BASE_URL,
            },
        )

        # A row created before ``base_url`` was populated may have it empty;
        # backfill it so the existing object carries the site root too.
        if not created and not website.base_url:
            website.base_url = self.BASE_URL
            website.save()

        self.stdout.write(f"开始全站爬取: {self.START_URL}")
        # NOTE(review): max_pages presumably caps how many pages the crawler
        # visits (its own default is 1000) — confirm in core.utils.
        full_site_crawler(self.START_URL, website, max_pages=500)
        self.stdout.write("爬取完成")

View File

@@ -8,7 +8,7 @@ class Command(BaseCommand):
def handle(self, *args, **kwargs): def handle(self, *args, **kwargs):
website, created = Website.objects.get_or_create( website, created = Website.objects.get_or_create(
name="www.news.cn", name="新华网",
defaults={ defaults={
'article_list_url': 'https://www.news.cn/', 'article_list_url': 'https://www.news.cn/',
'article_selector': 'a' 'article_selector': 'a'

View File

@@ -4,7 +4,6 @@ import json
import csv import csv
import os import os
from django.conf import settings from django.conf import settings
from django.core.files.storage import default_storage
import zipfile import zipfile
from django.utils import timezone from django.utils import timezone

View File

@@ -9,9 +9,9 @@
<label for="website-select">选择网站:</label> <label for="website-select">选择网站:</label>
<select name="website_name" id="website-select" required> <select name="website_name" id="website-select" required>
<option value="">-- 请选择网站 --</option> <option value="">-- 请选择网站 --</option>
<option value="crawl_xinhua">新华网</option> {% for website in cl.model_admin.get_websites %}
<option value="crawl_dongfangyancao">东方烟草报</option> <option value="{{ website.name }}">{{ website.name }}</option>
<option value="crawl_articles">通用爬虫</option> {% endfor %}
</select> </select>
<input type="submit" value="执行爬虫" class="default" style="margin-left: 10px;"/> <input type="submit" value="执行爬虫" class="default" style="margin-left: 10px;"/>
</form> </form>

View File

@@ -2,7 +2,8 @@
<html lang="zh"> <html lang="zh">
<head> <head>
<meta charset="UTF-8"/> <meta charset="UTF-8"/>
<title>绿色课堂文章列表</title> <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<title>{{ article.title }} - 绿色课堂</title>
<style> <style>
body { body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
@@ -10,15 +11,16 @@
color: #333; color: #333;
margin: 0 auto; margin: 0 auto;
padding: 20px; padding: 20px;
background-color: #f0f8ff; /* 统一背景色调 */ background-color: #f0f8ff;
max-width: 800px;
} }
.container { .container {
background: white; background: white;
padding: 30px; padding: 30px;
margin-bottom: 20px; margin-bottom: 20px;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* 添加轻微阴影 */ box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
border-radius: 8px; /* 添加圆角 */ border-radius: 8px;
} }
h1 { h1 {
@@ -28,765 +30,59 @@
margin-top: 0; margin-top: 0;
} }
.filters { .meta {
color: #78909c;
font-size: 0.9em;
margin-bottom: 20px; margin-bottom: 20px;
padding: 15px;
background-color: #e3f2fd; /* 统一滤镜背景色调 */
border-radius: 5px;
} }
.filters a { .content {
margin-top: 20px;
}
.content img {
max-width: 100%;
height: auto;
}
.back-link {
display: inline-block; display: inline-block;
padding: 5px 10px; margin-bottom: 20px;
margin: 0 5px 5px 0; color: #1976d2;
background-color: #bbdefb; /* 统一链接背景色调 */ text-decoration: none;
}
.back-link:hover {
color: #0d47a1; color: #0d47a1;
text-decoration: none;
border-radius: 3px;
}
.filters a.active {
background-color: #3498db;
color: white;
}
ul {
list-style: none;
padding: 0;
}
li {
padding: 10px 0;
border-bottom: 1px solid #e0e0e0; /* 统一分隔线颜色 */
}
li:last-child {
border-bottom: none;
}
a {
color: #1976d2; /* 统一链接颜色 */
text-decoration: none;
}
a:hover {
color: #0d47a1; /* 统一悬停颜色 */
text-decoration: underline; text-decoration: underline;
} }
.meta { @media (max-width: 600px) {
color: #78909c; /* 统一元数据颜色 */ body {
font-size: 0.9em;
}
.pagination {
margin-top: 30px;
text-align: center;
padding: 20px 0;
}
.pagination a {
display: inline-block;
padding: 8px 16px;
background-color: #3498db;
color: white;
text-decoration: none;
border-radius: 4px;
margin: 0 2px; /* 修改:调整页码间距 */
}
.pagination a:hover {
background-color: #2980b9;
}
.pagination span {
margin: 0 10px;
color: #7f8c8d;
}
/* 新增:当前页码样式 */
.pagination .current {
background-color: #2980b9;
cursor: default;
}
/* 新增:省略号样式 */
.pagination .ellipsis {
display: inline-block;
padding: 8px 4px;
color: #7f8c8d;
}
/* 新增:搜索框样式 */
.search-form {
margin-bottom: 20px;
padding: 15px;
background-color: #e3f2fd; /* 统一搜索框背景色调 */
border-radius: 5px;
}
.search-form input[type="text"] {
padding: 8px 12px;
border: 1px solid #bbdefb; /* 统一边框颜色 */
border-radius: 4px;
width: 300px;
margin-right: 10px;
background-color: #fff;
}
.search-form input[type="submit"] {
padding: 8px 16px;
background-color: #3498db;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
}
.search-form input[type="submit"]:hover {
background-color: #2980b9;
}
.search-info {
color: #78909c; /* 统一搜索信息颜色 */
font-size: 0.9em;
margin-bottom: 10px;
}
/* 新增:左侧筛选栏样式 */
.content-wrapper {
display: flex;
gap: 20px;
}
.sidebar {
flex: 0 0 200px;
background-color: #e3f2fd; /* 统一边栏背景色调 */
border-radius: 5px;
padding: 15px;
}
.main-content {
flex: 1;
}
.sidebar .filters {
margin-bottom: 20px;
padding: 0;
background-color: transparent;
}
.sidebar .filters strong {
display: block;
margin-bottom: 10px;
color: #2c3e50;
}
.sidebar .filters a {
display: block;
padding: 8px 10px;
margin: 0 0 5px 0;
background-color: #bbdefb; /* 统一边栏链接背景色调 */
color: #0d47a1;
text-decoration: none;
border-radius: 3px;
}
.sidebar .filters a.active {
background-color: #3498db;
color: white;
}
/* 新增:导出功能样式 */
.export-section {
margin-bottom: 20px;
padding: 15px;
background-color: #e8f5e9; /* 统一导出区域背景色调 */
border-radius: 5px;
text-align: center;
}
.export-btn {
padding: 10px 20px;
background-color: #4caf50; /* 统一按钮背景色调 */
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
margin: 0 5px;
}
.export-btn:hover {
background-color: #388e3c; /* 统一按钮悬停色调 */
}
.export-btn:disabled {
background-color: #9e9e9e; /* 统一禁用按钮色调 */
cursor: not-allowed;
}
.article-checkbox {
margin-right: 10px;
}
/* 新增:爬虫控制按钮样式 */
.crawler-control {
margin-bottom: 20px;
padding: 15px;
background-color: #fff3e0; /* 统一爬虫控制区域背景色调 */
border-radius: 5px;
text-align: center;
}
.crawler-btn {
padding: 10px 20px;
background-color: #ff9800; /* 统一爬虫按钮背景色调 */
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
margin: 0 5px;
}
.crawler-btn:hover {
background-color: #f57c00; /* 统一爬虫按钮悬停色调 */
}
.crawler-btn:disabled {
background-color: #9e9e9e; /* 统一禁用爬虫按钮色调 */
cursor: not-allowed;
}
.crawler-result {
margin-top: 10px;
padding: 10px; padding: 10px;
border-radius: 4px;
display: none;
} }
.crawler-result.success { .container {
background-color: #e8f5e9; padding: 15px;
color: #2e7d32;
} }
.crawler-result.error {
background-color: #ffebee;
color: #c62828;
}
/* 新增:进度条样式 */
.progress-container {
margin-top: 10px;
display: none;
}
.progress-bar {
width: 100%;
height: 20px;
background-color: #e0e0e0;
border-radius: 10px;
overflow: hidden;
}
.progress-fill {
height: 100%;
background-color: #4caf50;
width: 0%;
transition: width 0.3s ease;
}
.progress-text {
margin-top: 5px;
font-size: 14px;
color: #666;
} }
</style> </style>
</head> </head>
<body> <body>
<div class="container"> <div class="container">
<h1>绿色课堂文章列表</h1> <a href="{% url 'article_list' %}" class="back-link">&laquo; 返回文章列表</a>
<!-- 新增:搜索表单 --> <h1>{{ article.title }}</h1>
<div class="search-form">
<form method="get"> <div class="meta">
<input type="text" name="q" placeholder="输入关键词搜索文章..." value="{{ search_query }}"> 网站: {{ article.website.name }} |
{% if selected_website %} 发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
<input type="hidden" name="website" value="{{ selected_website.id }}"> 创建时间: {{ article.created_at|date:"Y-m-d H:i" }}
{% endif %}
<input type="submit" value="搜索">
</form>
</div> </div>
<div class="content-wrapper"> <div class="content">
<!-- 左侧筛选栏 --> {{ article.content|safe }}
<div class="sidebar">
<div class="filters">
<strong>按网站筛选:</strong>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}"
{% if not selected_website %}class="active" {% endif %}>全部</a>
{% for website in websites %}
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}"
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
{% endfor %}
</div> </div>
</div> </div>
<!-- 主内容区域 -->
<div class="main-content">
<!-- 新增:搜索结果信息 -->
{% if search_query %}
<div class="search-info">
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
</div>
{% endif %}
<!-- 新增:导出功能 -->
<div class="export-section">
<button id="selectAllBtn" class="export-btn">全选</button>
<button id="deselectAllBtn" class="export-btn">取消全选</button>
<button id="exportJsonBtn" class="export-btn" disabled>导出为JSON</button>
<button id="exportCsvBtn" class="export-btn" disabled>导出为CSV</button>
<!-- 新增:导出为ZIP包按钮 -->
<button id="exportZipBtn" class="export-btn" disabled>导出为ZIP包</button>
</div>
<ul>
{% for article in page_obj %}
<li>
<input type="checkbox" class="article-checkbox" value="{{ article.id }}"
id="article_{{ article.id }}">
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
</li>
{% empty %}
<li>暂无文章</li>
{% endfor %}
</ul>
<div class="pagination">
{% if page_obj.has_previous %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo;
首页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
{% endif %}
{% endif %}
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
<!-- 修改:优化页码显示逻辑 -->
{% with page_obj.paginator as paginator %}
{% for num in paginator.page_range %}
{% if page_obj.number == num %}
<a href="#" class="current">{{ num }}</a>
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == 1 or num == paginator.num_pages %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %}
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
<span class="ellipsis">...</span>
{% endif %}
{% endfor %}
{% endwith %}
{% if page_obj.has_next %}
{% if selected_website %}
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
{% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
{% endif %}
{% endif %}
</div>
</div>
</div>
</div>
<script>
// 存储当前任务ID和检查状态的定时器
let currentTaskId = null;
let statusCheckInterval = null;
// 获取页面元素
const runCrawlerBtn = document.getElementById('runCrawlerBtn');
const runDongfangyancaoCrawlerBtn = document.getElementById('runDongfangyancaoCrawlerBtn');
const pauseCrawlerBtn = document.getElementById('pauseCrawlerBtn');
const progressContainer = document.getElementById('crawlerProgress');
const progressFill = document.getElementById('progressFill');
const progressText = document.getElementById('progressText');
const resultDiv = document.getElementById('crawlerResult');
// 绑定爬虫按钮事件
runCrawlerBtn.addEventListener('click', function () {
runCrawler('www.news.cn', 'crawl_xinhua');
});
runDongfangyancaoCrawlerBtn.addEventListener('click', function () {
runCrawler('东方烟草报', 'crawl_dongfangyancao');
});
// 暂停按钮事件
pauseCrawlerBtn.addEventListener('click', function () {
if (currentTaskId) {
pauseCrawler(currentTaskId);
}
});
// 运行爬虫函数
function runCrawler(websiteName, crawlerName) {
// 禁用按钮,防止重复点击
runCrawlerBtn.disabled = true;
runDongfangyancaoCrawlerBtn.disabled = true;
resultDiv.style.display = 'none';
// 显示进度区域
progressContainer.style.display = 'block';
updateProgress(0, '爬虫启动中...');
// 发送POST请求运行爬虫
fetch('{% url "run_crawler" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
'X-CSRFToken': '{{ csrf_token }}'
},
body: 'crawler_name=' + crawlerName
})
.then(response => response.json())
.then(data => {
if (data.status === 'success') {
currentTaskId = data.task_id;
pauseCrawlerBtn.disabled = false;
// 启动轮询检查爬虫状态
checkCrawlerStatus(currentTaskId);
} else {
// 显示错误信息
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result error';
resultDiv.textContent = data.message;
// 恢复按钮状态
runCrawlerBtn.disabled = false;
runDongfangyancaoCrawlerBtn.disabled = false;
progressContainer.style.display = 'none';
}
})
.catch(error => {
// 显示错误信息
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result error';
resultDiv.textContent = '请求失败: ' + error;
// 恢复按钮状态
runCrawlerBtn.disabled = false;
runDongfangyancaoCrawlerBtn.disabled = false;
progressContainer.style.display = 'none';
});
}
// 检查爬虫状态的函数
function checkCrawlerStatus(taskId) {
// 清除之前的定时器
if (statusCheckInterval) {
clearInterval(statusCheckInterval);
}
// 设置新的定时器,每秒检查一次状态
statusCheckInterval = setInterval(() => {
fetch('{% url "crawler_status" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
'X-CSRFToken': '{{ csrf_token }}'
},
body: 'task_id=' + taskId
})
.then(response => response.json())
.then(data => {
if (data.status === 'running') {
// 更新进度信息(模拟进度)
const elapsedTime = new Date() - new Date(data.start_time);
const progress = Math.min(90, Math.floor(elapsedTime / 1000));
updateProgress(progress, data.message);
} else if (data.status === 'completed') {
// 爬虫完成,显示结果
clearInterval(statusCheckInterval);
updateProgress(100, data.message);
// 恢复按钮状态
runCrawlerBtn.disabled = false;
runDongfangyancaoCrawlerBtn.disabled = false;
pauseCrawlerBtn.disabled = true;
// 显示结果信息
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result success';
resultDiv.textContent = data.message;
// 3秒后自动隐藏进度条
setTimeout(() => {
progressContainer.style.display = 'none';
}, 3000);
// 自动刷新页面以显示新文章
setTimeout(() => {
location.reload();
}, 2000);
} else if (data.status === 'paused') {
// 爬虫暂停
clearInterval(statusCheckInterval);
updateProgress(data.progress || 0, '爬虫已暂停');
// 恢复按钮状态
runCrawlerBtn.disabled = false;
runDongfangyancaoCrawlerBtn.disabled = false;
pauseCrawlerBtn.disabled = true;
// 显示结果信息
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result success';
resultDiv.textContent = '爬虫已暂停';
} else if (data.status === 'error') {
// 爬虫出错
clearInterval(statusCheckInterval);
progressContainer.style.display = 'none';
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result error';
resultDiv.textContent = data.message;
// 恢复按钮状态
runCrawlerBtn.disabled = false;
runDongfangyancaoCrawlerBtn.disabled = false;
pauseCrawlerBtn.disabled = true;
}
})
.catch(error => {
clearInterval(statusCheckInterval);
progressContainer.style.display = 'none';
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result error';
resultDiv.textContent = '检查状态失败: ' + error;
// 恢复按钮状态
runCrawlerBtn.disabled = false;
runDongfangyancaoCrawlerBtn.disabled = false;
pauseCrawlerBtn.disabled = true;
});
}, 1000);
}
// 更新进度条函数
function updateProgress(percent, text) {
progressFill.style.width = percent + '%';
progressText.textContent = text;
}
// 暂停爬虫函数
function pauseCrawler(taskId) {
fetch('{% url "pause_crawler" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
'X-CSRFToken': '{{ csrf_token }}'
},
body: 'task_id=' + taskId
})
.then(response => response.json())
.then(data => {
if (data.status === 'success') {
// 暂停成功更新UI
pauseCrawlerBtn.disabled = true;
updateProgress(data.progress || 0, '正在暂停...');
} else {
// 显示错误信息
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result error';
resultDiv.textContent = data.message;
}
})
.catch(error => {
// 显示错误信息
resultDiv.style.display = 'block';
resultDiv.className = 'crawler-result error';
resultDiv.textContent = '暂停请求失败: ' + error;
});
}
// 导出功能相关JavaScript
const checkboxes = document.querySelectorAll('.article-checkbox');
const exportJsonBtn = document.getElementById('exportJsonBtn');
const exportCsvBtn = document.getElementById('exportCsvBtn');
const selectAllBtn = document.getElementById('selectAllBtn');
const deselectAllBtn = document.getElementById('deselectAllBtn');
// 新增:获取ZIP导出按钮元素
const exportZipBtn = document.getElementById('exportZipBtn');
// 更新导出按钮状态
function updateExportButtons() {
const selectedCount = document.querySelectorAll('.article-checkbox:checked').length;
exportJsonBtn.disabled = selectedCount === 0;
exportCsvBtn.disabled = selectedCount === 0;
exportZipBtn.disabled = selectedCount === 0; // 新增:更新ZIP导出按钮状态
}
// 为所有复选框添加事件监听器
checkboxes.forEach(checkbox => {
checkbox.addEventListener('change', updateExportButtons);
});
// 全选功能
selectAllBtn.addEventListener('click', () => {
checkboxes.forEach(checkbox => {
checkbox.checked = true;
});
updateExportButtons();
});
// 取消全选功能
deselectAllBtn.addEventListener('click', () => {
checkboxes.forEach(checkbox => {
checkbox.checked = false;
});
updateExportButtons();
});
// 导出为JSON功能
exportJsonBtn.addEventListener('click', () => {
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
.map(checkbox => checkbox.value);
// 发送POST请求导出文章
fetch('{% url "export_articles" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-CSRFToken': '{{ csrf_token }}'
},
body: JSON.stringify({
article_ids: selectedArticles,
format: 'json'
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.json';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 导出为CSV功能
exportCsvBtn.addEventListener('click', () => {
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
.map(checkbox => checkbox.value);
// 发送POST请求导出文章
fetch('{% url "export_articles" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-CSRFToken': '{{ csrf_token }}'
},
body: JSON.stringify({
article_ids: selectedArticles,
format: 'csv'
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.csv';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 新增:导出为ZIP包功能
exportZipBtn.addEventListener('click', () => {
const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
.map(checkbox => checkbox.value);
// 发送POST请求导出文章为ZIP包
fetch('{% url "export_articles" %}', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'X-CSRFToken': '{{ csrf_token }}'
},
body: JSON.stringify({
article_ids: selectedArticles,
format: 'zip' // 指定导出格式为ZIP
})
})
.then(response => {
if (response.ok) {
return response.blob();
}
throw new Error('导出失败');
})
.then(blob => {
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = 'articles.zip';
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
})
.catch(error => {
alert('导出失败: ' + error);
});
});
// 初始化导出按钮状态
updateExportButtons();
</script>
</body> </body>
</html> </html>

View File

@@ -17,7 +17,7 @@
background: white; background: white;
padding: 30px; padding: 30px;
margin-bottom: 20px; margin-bottom: 20px;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* 添加轻微阴影 */ box-shadow: 0 2px 5px rgba(0,0,0,0.05); /* 添加轻微阴影 */
border-radius: 8px; /* 添加圆角 */ border-radius: 8px; /* 添加圆角 */
} }
@@ -251,11 +251,9 @@
<div class="sidebar"> <div class="sidebar">
<div class="filters"> <div class="filters">
<strong>按网站筛选:</strong> <strong>按网站筛选:</strong>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" <a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" {% if not selected_website %}class="active" {% endif %}>全部</a>
{% if not selected_website %}class="active" {% endif %}>全部</a>
{% for website in websites %} {% for website in websites %}
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" <a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" {% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
{% endfor %} {% endfor %}
</div> </div>
</div> </div>
@@ -283,8 +281,7 @@
<ul> <ul>
{% for article in page_obj %} {% for article in page_obj %}
<li> <li>
<input type="checkbox" class="article-checkbox" value="{{ article.id }}" <input type="checkbox" class="article-checkbox" value="{{ article.id }}" id="article_{{ article.id }}">
id="article_{{ article.id }}">
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a> <a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div> <div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
</li> </li>
@@ -296,11 +293,8 @@
<div class="pagination"> <div class="pagination">
{% if page_obj.has_previous %} {% if page_obj.has_previous %}
{% if selected_website %} {% if selected_website %}
<a href="?website= <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo; 首页</a>
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo; <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
首页</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
{% else %} {% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a> <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a> <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
@@ -316,15 +310,13 @@
<a href="#" class="current">{{ num }}</a> <a href="#" class="current">{{ num }}</a>
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %} {% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
{% if selected_website %} {% if selected_website %}
<a href="?website= <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %} {% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a> <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %} {% endif %}
{% elif num == 1 or num == paginator.num_pages %} {% elif num == 1 or num == paginator.num_pages %}
{% if selected_website %} {% if selected_website %}
<a href="?website= <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
{% else %} {% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a> <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
{% endif %} {% endif %}
@@ -336,15 +328,11 @@
{% if page_obj.has_next %} {% if page_obj.has_next %}
{% if selected_website %} {% if selected_website %}
<a href="?website= <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a> <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
<a href="?website=
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页
&raquo;</a>
{% else %} {% else %}
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a> <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页 <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
&raquo;</a>
{% endif %} {% endif %}
{% endif %} {% endif %}
</div> </div>

View File

@@ -107,6 +107,17 @@ def process_article(url, website):
soup.find("div", id="content") or soup.find("div", id="content") or
soup.find("div", class_="mainBody") soup.find("div", class_="mainBody")
) )
elif website.name == "人民日报":
# 人民日报网站的文章结构处理
title_tag = soup.find("h1") or soup.find("title")
# 查找主要内容区域
content_tag = (
soup.find("div", class_="content") or
soup.find("div", class_="article-content") or
soup.find("div", id="content") or
soup.find("div", class_="text") or
soup.find("section", class_="content")
)
else: else:
# 默认处理方式 # 默认处理方式
title_tag = soup.find("h1") or soup.find("title") title_tag = soup.find("h1") or soup.find("title")
@@ -256,6 +267,24 @@ def full_site_crawler(start_url, website, max_pages=1000):
("/xinwen/" in path) or ("/xinwen/" in path) or
("/huoban/" in path) ("/huoban/" in path)
) )
elif website.name == "人民日报":
# 人民日报的文章页面判断逻辑
parsed_url = urlparse(url)
path = parsed_url.path
# 修改: 增加更准确的文章页面判断逻辑
is_article_page = (
(soup.find("div", class_="content") is not None and
soup.find("h1") is not None) or
soup.find("div", class_="article-content") is not None or
(soup.find("div", id="content") is not None and
soup.find("h1") is not None) or
soup.find("div", class_="text") is not None or
soup.find("section", class_="content") is not None or
("/article/" in path) or
(path.startswith("/detail/") and len(path) > 10) or
# 增加对peopleapp.com特定文章路径的判断
("/dynamic/" in path and "article" in path)
)
else: else:
# 默认判断逻辑 # 默认判断逻辑
is_article_page = ( is_article_page = (
@@ -271,5 +300,18 @@ def full_site_crawler(start_url, website, max_pages=1000):
# 扩展队列,发现新链接 # 扩展队列,发现新链接
for link in soup.find_all("a", href=True): for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"]) href = urljoin(url, link["href"])
if href not in visited and is_valid_url(href, base_netloc): # 对于人民日报网站,我们扩展链接发现逻辑
if website.name == "人民日报":
# 允许爬取以https://www.peopleapp.com/开头的链接
if href.startswith("https://www.peopleapp.com/") and href not in visited:
# 增加对文章链接的识别
parsed_href = urlparse(href)
href_path = parsed_href.path
# 添加更多可能的文章链接模式
if ("/article/" in href_path or
href_path.startswith("/detail/") or
("/dynamic/" in href_path and "article" in href_path) or
href_path.count("/") > 2): # 更深层的页面可能是文章页
queue.append(href)
elif href not in visited and is_valid_url(href, base_netloc):
queue.append(href) queue.append(href)

View File

@@ -22,6 +22,7 @@ def article_list(request):
# 获取筛选网站 # 获取筛选网站
selected_website = None selected_website = None
# 修改:确保始终获取所有文章,除非有特定筛选
articles = Article.objects.all() articles = Article.objects.all()
website_id = request.GET.get('website') website_id = request.GET.get('website')