fix bugs
This commit is contained in:
@@ -1,9 +1,6 @@
|
|||||||
from django.contrib import admin
|
|
||||||
from django.contrib.admin import AdminSite
|
|
||||||
from .models import Website, Article
|
from .models import Website, Article
|
||||||
# 添加actions相关的导入
|
# 添加actions相关的导入
|
||||||
from django.contrib import messages
|
from django.contrib import messages
|
||||||
from django.http import HttpResponseRedirect
|
|
||||||
# 添加导出功能所需导入
|
# 添加导出功能所需导入
|
||||||
import csv
|
import csv
|
||||||
from django.http import HttpResponse
|
from django.http import HttpResponse
|
||||||
@@ -12,17 +9,7 @@ import json
|
|||||||
from django.shortcuts import render, redirect
|
from django.shortcuts import render, redirect
|
||||||
from django.urls import path
|
from django.urls import path
|
||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
from django.http import JsonResponse
|
|
||||||
from django.views.decorators.http import require_http_methods
|
|
||||||
from django.core.management import call_command
|
from django.core.management import call_command
|
||||||
import threading
|
|
||||||
import uuid
|
|
||||||
from django.utils import timezone
|
|
||||||
|
|
||||||
|
|
||||||
# 创建自定义管理站点
|
|
||||||
|
|
||||||
# 实例化管理站点
|
|
||||||
|
|
||||||
# 添加运行爬虫的视图函数
|
# 添加运行爬虫的视图函数
|
||||||
def run_crawler_view(request):
|
def run_crawler_view(request):
|
||||||
@@ -36,21 +23,24 @@ def run_crawler_view(request):
|
|||||||
return redirect('admin:core_article_changelist')
|
return redirect('admin:core_article_changelist')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 根据网站名称确定要执行的爬虫命令
|
# 动态获取网站对象
|
||||||
if website_name == 'crawl_xinhua':
|
website = Website.objects.get(name=website_name)
|
||||||
crawler_name = 'crawl_xinhua'
|
|
||||||
elif website_name == 'crawl_dongfangyancao':
|
|
||||||
crawler_name = 'crawl_dongfangyancao'
|
|
||||||
elif website_name == 'crawl_articles':
|
|
||||||
crawler_name = 'crawl_articles'
|
|
||||||
else:
|
|
||||||
# 对于其他网站,使用通用爬虫命令
|
|
||||||
crawler_name = 'crawl_articles'
|
|
||||||
|
|
||||||
# 运行爬虫命令,不传递website_name作为参数
|
# 根据网站对象确定要执行的爬虫命令
|
||||||
call_command(crawler_name)
|
# 移除默认的通用爬虫,每个网站必须配置自己的爬虫命令
|
||||||
|
crawler_name = getattr(website, 'crawler_command', None)
|
||||||
|
|
||||||
messages.success(request, f'成功执行爬虫: {crawler_name}')
|
# 如果网站没有配置爬虫命令,则报错
|
||||||
|
if not crawler_name:
|
||||||
|
messages.error(request, f'网站 {website_name} 未配置爬虫命令')
|
||||||
|
return redirect('admin:core_article_changelist')
|
||||||
|
|
||||||
|
# 运行爬虫命令,传递网站名称
|
||||||
|
call_command(crawler_name, website_name)
|
||||||
|
|
||||||
|
messages.success(request, f'成功执行爬虫: {website_name}')
|
||||||
|
except Website.DoesNotExist:
|
||||||
|
messages.error(request, f'网站不存在: {website_name}')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
messages.error(request, f'执行爬虫失败: {str(e)}')
|
messages.error(request, f'执行爬虫失败: {str(e)}')
|
||||||
|
|
||||||
@@ -71,6 +61,10 @@ class ArticleAdmin(admin.ModelAdmin):
|
|||||||
actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json',
|
actions = ['delete_selected_articles', 'export_as_csv', 'export_as_json',
|
||||||
'export_as_word', 'export_with_media']
|
'export_as_word', 'export_with_media']
|
||||||
|
|
||||||
|
def get_websites(self):
    """Return the websites whose ``enabled`` flag is set.

    The result is the queryset produced by filtering ``Website`` on
    ``enabled=True``; it is returned as-is without being evaluated here.
    """
    enabled_sites = Website.objects.filter(enabled=True)
    return enabled_sites
|
||||||
|
|
||||||
# 重写get_urls方法,添加自定义URL
|
# 重写get_urls方法,添加自定义URL
|
||||||
def get_urls(self):
|
def get_urls(self):
|
||||||
urls = super().get_urls()
|
urls = super().get_urls()
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
def handle(self, *args, **kwargs):
|
def handle(self, *args, **kwargs):
|
||||||
website, created = Website.objects.get_or_create(
|
website, created = Website.objects.get_or_create(
|
||||||
name="www.gov.cn",
|
name="中国政府网",
|
||||||
defaults={
|
defaults={
|
||||||
'article_list_url': 'https://www.gov.cn/',
|
'article_list_url': 'https://www.gov.cn/',
|
||||||
'article_selector': 'a'
|
'article_selector': 'a'
|
||||||
|
|||||||
26
core/management/commands/crawl_rmrb.py
Normal file
26
core/management/commands/crawl_rmrb.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
from core.models import Website
|
||||||
|
from core.utils import full_site_crawler
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Recursively crawl the People's Daily site (https://www.peopleapp.com).

    The command makes sure a ``Website`` row named "人民日报" exists, back-fills
    its ``base_url`` when an existing row has none, and then hands the start
    URL to ``full_site_crawler`` with a cap of 500 pages.
    """

    help = "全站递归爬取 人民日报 https://www.peopleapp.com"

    # Single definition of the URLs used below, so the Website defaults, the
    # base_url back-fill and the crawl start point cannot drift apart.
    _BASE_URL = 'https://www.peopleapp.com'
    _START_URL = 'https://www.peopleapp.com/home'

    def add_arguments(self, parser):
        """Declare an optional positional ``website_name`` argument.

        The admin crawler view invokes commands as
        ``call_command(crawler_name, website_name)``. Without a declared
        positional, the argument parser rejects that extra value and the
        call fails before ``handle`` runs. The value is accepted but not
        used: this command always targets the "人民日报" website.
        """
        parser.add_argument(
            'website_name',
            nargs='?',
            default=None,
            help='Optional website name passed by the admin view (ignored).',
        )

    def handle(self, *args, **kwargs):
        """Ensure the Website record exists, then run the full-site crawl."""
        website, created = Website.objects.get_or_create(
            name="人民日报",
            defaults={
                'article_list_url': self._START_URL,
                'article_selector': 'a',
                'base_url': self._BASE_URL,
            },
        )
        # `defaults` only apply when the row is created; an already existing
        # row with an empty base_url must be updated explicitly.
        if not created and not website.base_url:
            website.base_url = self._BASE_URL
            website.save()

        self.stdout.write(f"开始全站爬取: {self._START_URL}")
        full_site_crawler(self._START_URL, website, max_pages=500)
        self.stdout.write("爬取完成")
|
||||||
@@ -8,7 +8,7 @@ class Command(BaseCommand):
|
|||||||
|
|
||||||
def handle(self, *args, **kwargs):
|
def handle(self, *args, **kwargs):
|
||||||
website, created = Website.objects.get_or_create(
|
website, created = Website.objects.get_or_create(
|
||||||
name="www.news.cn",
|
name="新华网",
|
||||||
defaults={
|
defaults={
|
||||||
'article_list_url': 'https://www.news.cn/',
|
'article_list_url': 'https://www.news.cn/',
|
||||||
'article_selector': 'a'
|
'article_selector': 'a'
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ import json
|
|||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.core.files.storage import default_storage
|
|
||||||
import zipfile
|
import zipfile
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
|
|||||||
@@ -9,9 +9,9 @@
|
|||||||
<label for="website-select">选择网站:</label>
|
<label for="website-select">选择网站:</label>
|
||||||
<select name="website_name" id="website-select" required>
|
<select name="website_name" id="website-select" required>
|
||||||
<option value="">-- 请选择网站 --</option>
|
<option value="">-- 请选择网站 --</option>
|
||||||
<option value="crawl_xinhua">新华网</option>
|
{% for website in cl.model_admin.get_websites %}
|
||||||
<option value="crawl_dongfangyancao">东方烟草报</option>
|
<option value="{{ website.name }}">{{ website.name }}</option>
|
||||||
<option value="crawl_articles">通用爬虫</option>
|
{% endfor %}
|
||||||
</select>
|
</select>
|
||||||
<input type="submit" value="执行爬虫" class="default" style="margin-left: 10px;"/>
|
<input type="submit" value="执行爬虫" class="default" style="margin-left: 10px;"/>
|
||||||
</form>
|
</form>
|
||||||
|
|||||||
@@ -2,7 +2,8 @@
|
|||||||
<html lang="zh">
|
<html lang="zh">
|
||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8"/>
|
<meta charset="UTF-8"/>
|
||||||
<title>绿色课堂文章列表</title>
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
|
||||||
|
<title>{{ article.title }} - 绿色课堂</title>
|
||||||
<style>
|
<style>
|
||||||
body {
|
body {
|
||||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
||||||
@@ -10,15 +11,16 @@
|
|||||||
color: #333;
|
color: #333;
|
||||||
margin: 0 auto;
|
margin: 0 auto;
|
||||||
padding: 20px;
|
padding: 20px;
|
||||||
background-color: #f0f8ff; /* 统一背景色调 */
|
background-color: #f0f8ff;
|
||||||
|
max-width: 800px;
|
||||||
}
|
}
|
||||||
|
|
||||||
.container {
|
.container {
|
||||||
background: white;
|
background: white;
|
||||||
padding: 30px;
|
padding: 30px;
|
||||||
margin-bottom: 20px;
|
margin-bottom: 20px;
|
||||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05); /* 添加轻微阴影 */
|
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
|
||||||
border-radius: 8px; /* 添加圆角 */
|
border-radius: 8px;
|
||||||
}
|
}
|
||||||
|
|
||||||
h1 {
|
h1 {
|
||||||
@@ -28,765 +30,59 @@
|
|||||||
margin-top: 0;
|
margin-top: 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
.filters {
|
.meta {
|
||||||
|
color: #78909c;
|
||||||
|
font-size: 0.9em;
|
||||||
margin-bottom: 20px;
|
margin-bottom: 20px;
|
||||||
padding: 15px;
|
|
||||||
background-color: #e3f2fd; /* 统一滤镜背景色调 */
|
|
||||||
border-radius: 5px;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
.filters a {
|
.content {
|
||||||
|
margin-top: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.content img {
|
||||||
|
max-width: 100%;
|
||||||
|
height: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
.back-link {
|
||||||
display: inline-block;
|
display: inline-block;
|
||||||
padding: 5px 10px;
|
margin-bottom: 20px;
|
||||||
margin: 0 5px 5px 0;
|
color: #1976d2;
|
||||||
background-color: #bbdefb; /* 统一链接背景色调 */
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.back-link:hover {
|
||||||
color: #0d47a1;
|
color: #0d47a1;
|
||||||
text-decoration: none;
|
|
||||||
border-radius: 3px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.filters a.active {
|
|
||||||
background-color: #3498db;
|
|
||||||
color: white;
|
|
||||||
}
|
|
||||||
|
|
||||||
ul {
|
|
||||||
list-style: none;
|
|
||||||
padding: 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
li {
|
|
||||||
padding: 10px 0;
|
|
||||||
border-bottom: 1px solid #e0e0e0; /* 统一分隔线颜色 */
|
|
||||||
}
|
|
||||||
|
|
||||||
li:last-child {
|
|
||||||
border-bottom: none;
|
|
||||||
}
|
|
||||||
|
|
||||||
a {
|
|
||||||
color: #1976d2; /* 统一链接颜色 */
|
|
||||||
text-decoration: none;
|
|
||||||
}
|
|
||||||
|
|
||||||
a:hover {
|
|
||||||
color: #0d47a1; /* 统一悬停颜色 */
|
|
||||||
text-decoration: underline;
|
text-decoration: underline;
|
||||||
}
|
}
|
||||||
|
|
||||||
.meta {
|
@media (max-width: 600px) {
|
||||||
color: #78909c; /* 统一元数据颜色 */
|
body {
|
||||||
font-size: 0.9em;
|
|
||||||
}
|
|
||||||
|
|
||||||
.pagination {
|
|
||||||
margin-top: 30px;
|
|
||||||
text-align: center;
|
|
||||||
padding: 20px 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
.pagination a {
|
|
||||||
display: inline-block;
|
|
||||||
padding: 8px 16px;
|
|
||||||
background-color: #3498db;
|
|
||||||
color: white;
|
|
||||||
text-decoration: none;
|
|
||||||
border-radius: 4px;
|
|
||||||
margin: 0 2px; /* 修改:调整页码间距 */
|
|
||||||
}
|
|
||||||
|
|
||||||
.pagination a:hover {
|
|
||||||
background-color: #2980b9;
|
|
||||||
}
|
|
||||||
|
|
||||||
.pagination span {
|
|
||||||
margin: 0 10px;
|
|
||||||
color: #7f8c8d;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 新增:当前页码样式 */
|
|
||||||
.pagination .current {
|
|
||||||
background-color: #2980b9;
|
|
||||||
cursor: default;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 新增:省略号样式 */
|
|
||||||
.pagination .ellipsis {
|
|
||||||
display: inline-block;
|
|
||||||
padding: 8px 4px;
|
|
||||||
color: #7f8c8d;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 新增:搜索框样式 */
|
|
||||||
.search-form {
|
|
||||||
margin-bottom: 20px;
|
|
||||||
padding: 15px;
|
|
||||||
background-color: #e3f2fd; /* 统一搜索框背景色调 */
|
|
||||||
border-radius: 5px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.search-form input[type="text"] {
|
|
||||||
padding: 8px 12px;
|
|
||||||
border: 1px solid #bbdefb; /* 统一边框颜色 */
|
|
||||||
border-radius: 4px;
|
|
||||||
width: 300px;
|
|
||||||
margin-right: 10px;
|
|
||||||
background-color: #fff;
|
|
||||||
}
|
|
||||||
|
|
||||||
.search-form input[type="submit"] {
|
|
||||||
padding: 8px 16px;
|
|
||||||
background-color: #3498db;
|
|
||||||
color: white;
|
|
||||||
border: none;
|
|
||||||
border-radius: 4px;
|
|
||||||
cursor: pointer;
|
|
||||||
}
|
|
||||||
|
|
||||||
.search-form input[type="submit"]:hover {
|
|
||||||
background-color: #2980b9;
|
|
||||||
}
|
|
||||||
|
|
||||||
.search-info {
|
|
||||||
color: #78909c; /* 统一搜索信息颜色 */
|
|
||||||
font-size: 0.9em;
|
|
||||||
margin-bottom: 10px;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 新增:左侧筛选栏样式 */
|
|
||||||
.content-wrapper {
|
|
||||||
display: flex;
|
|
||||||
gap: 20px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.sidebar {
|
|
||||||
flex: 0 0 200px;
|
|
||||||
background-color: #e3f2fd; /* 统一边栏背景色调 */
|
|
||||||
border-radius: 5px;
|
|
||||||
padding: 15px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.main-content {
|
|
||||||
flex: 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
.sidebar .filters {
|
|
||||||
margin-bottom: 20px;
|
|
||||||
padding: 0;
|
|
||||||
background-color: transparent;
|
|
||||||
}
|
|
||||||
|
|
||||||
.sidebar .filters strong {
|
|
||||||
display: block;
|
|
||||||
margin-bottom: 10px;
|
|
||||||
color: #2c3e50;
|
|
||||||
}
|
|
||||||
|
|
||||||
.sidebar .filters a {
|
|
||||||
display: block;
|
|
||||||
padding: 8px 10px;
|
|
||||||
margin: 0 0 5px 0;
|
|
||||||
background-color: #bbdefb; /* 统一边栏链接背景色调 */
|
|
||||||
color: #0d47a1;
|
|
||||||
text-decoration: none;
|
|
||||||
border-radius: 3px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.sidebar .filters a.active {
|
|
||||||
background-color: #3498db;
|
|
||||||
color: white;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 新增:导出功能样式 */
|
|
||||||
.export-section {
|
|
||||||
margin-bottom: 20px;
|
|
||||||
padding: 15px;
|
|
||||||
background-color: #e8f5e9; /* 统一导出区域背景色调 */
|
|
||||||
border-radius: 5px;
|
|
||||||
text-align: center;
|
|
||||||
}
|
|
||||||
|
|
||||||
.export-btn {
|
|
||||||
padding: 10px 20px;
|
|
||||||
background-color: #4caf50; /* 统一按钮背景色调 */
|
|
||||||
color: white;
|
|
||||||
border: none;
|
|
||||||
border-radius: 4px;
|
|
||||||
cursor: pointer;
|
|
||||||
font-size: 16px;
|
|
||||||
margin: 0 5px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.export-btn:hover {
|
|
||||||
background-color: #388e3c; /* 统一按钮悬停色调 */
|
|
||||||
}
|
|
||||||
|
|
||||||
.export-btn:disabled {
|
|
||||||
background-color: #9e9e9e; /* 统一禁用按钮色调 */
|
|
||||||
cursor: not-allowed;
|
|
||||||
}
|
|
||||||
|
|
||||||
.article-checkbox {
|
|
||||||
margin-right: 10px;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 新增:爬虫控制按钮样式 */
|
|
||||||
.crawler-control {
|
|
||||||
margin-bottom: 20px;
|
|
||||||
padding: 15px;
|
|
||||||
background-color: #fff3e0; /* 统一爬虫控制区域背景色调 */
|
|
||||||
border-radius: 5px;
|
|
||||||
text-align: center;
|
|
||||||
}
|
|
||||||
|
|
||||||
.crawler-btn {
|
|
||||||
padding: 10px 20px;
|
|
||||||
background-color: #ff9800; /* 统一爬虫按钮背景色调 */
|
|
||||||
color: white;
|
|
||||||
border: none;
|
|
||||||
border-radius: 4px;
|
|
||||||
cursor: pointer;
|
|
||||||
font-size: 16px;
|
|
||||||
margin: 0 5px;
|
|
||||||
}
|
|
||||||
|
|
||||||
.crawler-btn:hover {
|
|
||||||
background-color: #f57c00; /* 统一爬虫按钮悬停色调 */
|
|
||||||
}
|
|
||||||
|
|
||||||
.crawler-btn:disabled {
|
|
||||||
background-color: #9e9e9e; /* 统一禁用爬虫按钮色调 */
|
|
||||||
cursor: not-allowed;
|
|
||||||
}
|
|
||||||
|
|
||||||
.crawler-result {
|
|
||||||
margin-top: 10px;
|
|
||||||
padding: 10px;
|
padding: 10px;
|
||||||
border-radius: 4px;
|
|
||||||
display: none;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
.crawler-result.success {
|
.container {
|
||||||
background-color: #e8f5e9;
|
padding: 15px;
|
||||||
color: #2e7d32;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
.crawler-result.error {
|
|
||||||
background-color: #ffebee;
|
|
||||||
color: #c62828;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 新增:进度条样式 */
|
|
||||||
.progress-container {
|
|
||||||
margin-top: 10px;
|
|
||||||
display: none;
|
|
||||||
}
|
|
||||||
|
|
||||||
.progress-bar {
|
|
||||||
width: 100%;
|
|
||||||
height: 20px;
|
|
||||||
background-color: #e0e0e0;
|
|
||||||
border-radius: 10px;
|
|
||||||
overflow: hidden;
|
|
||||||
}
|
|
||||||
|
|
||||||
.progress-fill {
|
|
||||||
height: 100%;
|
|
||||||
background-color: #4caf50;
|
|
||||||
width: 0%;
|
|
||||||
transition: width 0.3s ease;
|
|
||||||
}
|
|
||||||
|
|
||||||
.progress-text {
|
|
||||||
margin-top: 5px;
|
|
||||||
font-size: 14px;
|
|
||||||
color: #666;
|
|
||||||
}
|
}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div class="container">
|
<div class="container">
|
||||||
<h1>绿色课堂文章列表</h1>
|
<a href="{% url 'article_list' %}" class="back-link">« 返回文章列表</a>
|
||||||
|
|
||||||
<!-- 新增:搜索表单 -->
|
<h1>{{ article.title }}</h1>
|
||||||
<div class="search-form">
|
|
||||||
<form method="get">
|
<div class="meta">
|
||||||
<input type="text" name="q" placeholder="输入关键词搜索文章..." value="{{ search_query }}">
|
网站: {{ article.website.name }} |
|
||||||
{% if selected_website %}
|
发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
|
||||||
<input type="hidden" name="website" value="{{ selected_website.id }}">
|
创建时间: {{ article.created_at|date:"Y-m-d H:i" }}
|
||||||
{% endif %}
|
|
||||||
<input type="submit" value="搜索">
|
|
||||||
</form>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="content-wrapper">
|
<div class="content">
|
||||||
<!-- 左侧筛选栏 -->
|
{{ article.content|safe }}
|
||||||
<div class="sidebar">
|
|
||||||
<div class="filters">
|
|
||||||
<strong>按网站筛选:</strong>
|
|
||||||
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}"
|
|
||||||
{% if not selected_website %}class="active" {% endif %}>全部</a>
|
|
||||||
{% for website in websites %}
|
|
||||||
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}"
|
|
||||||
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
|
|
||||||
{% endfor %}
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- 主内容区域 -->
|
|
||||||
<div class="main-content">
|
|
||||||
<!-- 新增:搜索结果信息 -->
|
|
||||||
{% if search_query %}
|
|
||||||
<div class="search-info">
|
|
||||||
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
|
|
||||||
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
|
|
||||||
</div>
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
<!-- 新增:导出功能 -->
|
|
||||||
<div class="export-section">
|
|
||||||
<button id="selectAllBtn" class="export-btn">全选</button>
|
|
||||||
<button id="deselectAllBtn" class="export-btn">取消全选</button>
|
|
||||||
<button id="exportJsonBtn" class="export-btn" disabled>导出为JSON</button>
|
|
||||||
<button id="exportCsvBtn" class="export-btn" disabled>导出为CSV</button>
|
|
||||||
<!-- 新增:导出为ZIP包按钮 -->
|
|
||||||
<button id="exportZipBtn" class="export-btn" disabled>导出为ZIP包</button>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<ul>
|
|
||||||
{% for article in page_obj %}
|
|
||||||
<li>
|
|
||||||
<input type="checkbox" class="article-checkbox" value="{{ article.id }}"
|
|
||||||
id="article_{{ article.id }}">
|
|
||||||
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
|
|
||||||
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
|
|
||||||
</li>
|
|
||||||
{% empty %}
|
|
||||||
<li>暂无文章</li>
|
|
||||||
{% endfor %}
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
<div class="pagination">
|
|
||||||
{% if page_obj.has_previous %}
|
|
||||||
{% if selected_website %}
|
|
||||||
<a href="?website=
|
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">«
|
|
||||||
首页</a>
|
|
||||||
<a href="?website=
|
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
|
|
||||||
{% else %}
|
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">« 首页</a>
|
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
|
|
||||||
{% endif %}
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
|
|
||||||
|
|
||||||
<!-- 修改:优化页码显示逻辑 -->
|
|
||||||
{% with page_obj.paginator as paginator %}
|
|
||||||
{% for num in paginator.page_range %}
|
|
||||||
{% if page_obj.number == num %}
|
|
||||||
<a href="#" class="current">{{ num }}</a>
|
|
||||||
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
|
|
||||||
{% if selected_website %}
|
|
||||||
<a href="?website=
|
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
|
||||||
{% else %}
|
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
|
||||||
{% endif %}
|
|
||||||
{% elif num == 1 or num == paginator.num_pages %}
|
|
||||||
{% if selected_website %}
|
|
||||||
<a href="?website=
|
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
|
||||||
{% else %}
|
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
|
||||||
{% endif %}
|
|
||||||
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
|
|
||||||
<span class="ellipsis">...</span>
|
|
||||||
{% endif %}
|
|
||||||
{% endfor %}
|
|
||||||
{% endwith %}
|
|
||||||
|
|
||||||
{% if page_obj.has_next %}
|
|
||||||
{% if selected_website %}
|
|
||||||
<a href="?website=
|
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
|
|
||||||
<a href="?website=
|
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页
|
|
||||||
»</a>
|
|
||||||
{% else %}
|
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
|
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页
|
|
||||||
»</a>
|
|
||||||
{% endif %}
|
|
||||||
{% endif %}
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<script>
|
|
||||||
// 存储当前任务ID和检查状态的定时器
|
|
||||||
let currentTaskId = null;
|
|
||||||
let statusCheckInterval = null;
|
|
||||||
|
|
||||||
// 获取页面元素
|
|
||||||
const runCrawlerBtn = document.getElementById('runCrawlerBtn');
|
|
||||||
const runDongfangyancaoCrawlerBtn = document.getElementById('runDongfangyancaoCrawlerBtn');
|
|
||||||
const pauseCrawlerBtn = document.getElementById('pauseCrawlerBtn');
|
|
||||||
const progressContainer = document.getElementById('crawlerProgress');
|
|
||||||
const progressFill = document.getElementById('progressFill');
|
|
||||||
const progressText = document.getElementById('progressText');
|
|
||||||
const resultDiv = document.getElementById('crawlerResult');
|
|
||||||
|
|
||||||
// 绑定爬虫按钮事件
|
|
||||||
runCrawlerBtn.addEventListener('click', function () {
|
|
||||||
runCrawler('www.news.cn', 'crawl_xinhua');
|
|
||||||
});
|
|
||||||
|
|
||||||
runDongfangyancaoCrawlerBtn.addEventListener('click', function () {
|
|
||||||
runCrawler('东方烟草报', 'crawl_dongfangyancao');
|
|
||||||
});
|
|
||||||
|
|
||||||
// 暂停按钮事件
|
|
||||||
pauseCrawlerBtn.addEventListener('click', function () {
|
|
||||||
if (currentTaskId) {
|
|
||||||
pauseCrawler(currentTaskId);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// 运行爬虫函数
|
|
||||||
function runCrawler(websiteName, crawlerName) {
|
|
||||||
// 禁用按钮,防止重复点击
|
|
||||||
runCrawlerBtn.disabled = true;
|
|
||||||
runDongfangyancaoCrawlerBtn.disabled = true;
|
|
||||||
resultDiv.style.display = 'none';
|
|
||||||
|
|
||||||
// 显示进度区域
|
|
||||||
progressContainer.style.display = 'block';
|
|
||||||
updateProgress(0, '爬虫启动中...');
|
|
||||||
|
|
||||||
// 发送POST请求运行爬虫
|
|
||||||
fetch('{% url "run_crawler" %}', {
|
|
||||||
method: 'POST',
|
|
||||||
headers: {
|
|
||||||
'Content-Type': 'application/x-www-form-urlencoded',
|
|
||||||
'X-CSRFToken': '{{ csrf_token }}'
|
|
||||||
},
|
|
||||||
body: 'crawler_name=' + crawlerName
|
|
||||||
})
|
|
||||||
.then(response => response.json())
|
|
||||||
.then(data => {
|
|
||||||
if (data.status === 'success') {
|
|
||||||
currentTaskId = data.task_id;
|
|
||||||
pauseCrawlerBtn.disabled = false;
|
|
||||||
// 启动轮询检查爬虫状态
|
|
||||||
checkCrawlerStatus(currentTaskId);
|
|
||||||
} else {
|
|
||||||
// 显示错误信息
|
|
||||||
resultDiv.style.display = 'block';
|
|
||||||
resultDiv.className = 'crawler-result error';
|
|
||||||
resultDiv.textContent = data.message;
|
|
||||||
// 恢复按钮状态
|
|
||||||
runCrawlerBtn.disabled = false;
|
|
||||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
|
||||||
progressContainer.style.display = 'none';
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.catch(error => {
|
|
||||||
// 显示错误信息
|
|
||||||
resultDiv.style.display = 'block';
|
|
||||||
resultDiv.className = 'crawler-result error';
|
|
||||||
resultDiv.textContent = '请求失败: ' + error;
|
|
||||||
// 恢复按钮状态
|
|
||||||
runCrawlerBtn.disabled = false;
|
|
||||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
|
||||||
progressContainer.style.display = 'none';
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// 检查爬虫状态的函数
|
|
||||||
function checkCrawlerStatus(taskId) {
|
|
||||||
// 清除之前的定时器
|
|
||||||
if (statusCheckInterval) {
|
|
||||||
clearInterval(statusCheckInterval);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 设置新的定时器,每秒检查一次状态
|
|
||||||
statusCheckInterval = setInterval(() => {
|
|
||||||
fetch('{% url "crawler_status" %}', {
|
|
||||||
method: 'POST',
|
|
||||||
headers: {
|
|
||||||
'Content-Type': 'application/x-www-form-urlencoded',
|
|
||||||
'X-CSRFToken': '{{ csrf_token }}'
|
|
||||||
},
|
|
||||||
body: 'task_id=' + taskId
|
|
||||||
})
|
|
||||||
.then(response => response.json())
|
|
||||||
.then(data => {
|
|
||||||
if (data.status === 'running') {
|
|
||||||
// 更新进度信息(模拟进度)
|
|
||||||
const elapsedTime = new Date() - new Date(data.start_time);
|
|
||||||
const progress = Math.min(90, Math.floor(elapsedTime / 1000));
|
|
||||||
updateProgress(progress, data.message);
|
|
||||||
} else if (data.status === 'completed') {
|
|
||||||
// 爬虫完成,显示结果
|
|
||||||
clearInterval(statusCheckInterval);
|
|
||||||
updateProgress(100, data.message);
|
|
||||||
|
|
||||||
// 恢复按钮状态
|
|
||||||
runCrawlerBtn.disabled = false;
|
|
||||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
|
||||||
pauseCrawlerBtn.disabled = true;
|
|
||||||
|
|
||||||
// 显示结果信息
|
|
||||||
resultDiv.style.display = 'block';
|
|
||||||
resultDiv.className = 'crawler-result success';
|
|
||||||
resultDiv.textContent = data.message;
|
|
||||||
|
|
||||||
// 3秒后自动隐藏进度条
|
|
||||||
setTimeout(() => {
|
|
||||||
progressContainer.style.display = 'none';
|
|
||||||
}, 3000);
|
|
||||||
|
|
||||||
// 自动刷新页面以显示新文章
|
|
||||||
setTimeout(() => {
|
|
||||||
location.reload();
|
|
||||||
}, 2000);
|
|
||||||
} else if (data.status === 'paused') {
|
|
||||||
// 爬虫暂停
|
|
||||||
clearInterval(statusCheckInterval);
|
|
||||||
updateProgress(data.progress || 0, '爬虫已暂停');
|
|
||||||
|
|
||||||
// 恢复按钮状态
|
|
||||||
runCrawlerBtn.disabled = false;
|
|
||||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
|
||||||
pauseCrawlerBtn.disabled = true;
|
|
||||||
|
|
||||||
// 显示结果信息
|
|
||||||
resultDiv.style.display = 'block';
|
|
||||||
resultDiv.className = 'crawler-result success';
|
|
||||||
resultDiv.textContent = '爬虫已暂停';
|
|
||||||
} else if (data.status === 'error') {
|
|
||||||
// 爬虫出错
|
|
||||||
clearInterval(statusCheckInterval);
|
|
||||||
progressContainer.style.display = 'none';
|
|
||||||
resultDiv.style.display = 'block';
|
|
||||||
resultDiv.className = 'crawler-result error';
|
|
||||||
resultDiv.textContent = data.message;
|
|
||||||
|
|
||||||
// 恢复按钮状态
|
|
||||||
runCrawlerBtn.disabled = false;
|
|
||||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
|
||||||
pauseCrawlerBtn.disabled = true;
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.catch(error => {
|
|
||||||
clearInterval(statusCheckInterval);
|
|
||||||
progressContainer.style.display = 'none';
|
|
||||||
resultDiv.style.display = 'block';
|
|
||||||
resultDiv.className = 'crawler-result error';
|
|
||||||
resultDiv.textContent = '检查状态失败: ' + error;
|
|
||||||
|
|
||||||
// 恢复按钮状态
|
|
||||||
runCrawlerBtn.disabled = false;
|
|
||||||
runDongfangyancaoCrawlerBtn.disabled = false;
|
|
||||||
pauseCrawlerBtn.disabled = true;
|
|
||||||
});
|
|
||||||
}, 1000);
|
|
||||||
}
|
|
||||||
|
|
||||||
// 更新进度条函数
|
|
||||||
function updateProgress(percent, text) {
|
|
||||||
progressFill.style.width = percent + '%';
|
|
||||||
progressText.textContent = text;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 暂停爬虫函数
|
|
||||||
function pauseCrawler(taskId) {
|
|
||||||
fetch('{% url "pause_crawler" %}', {
|
|
||||||
method: 'POST',
|
|
||||||
headers: {
|
|
||||||
'Content-Type': 'application/x-www-form-urlencoded',
|
|
||||||
'X-CSRFToken': '{{ csrf_token }}'
|
|
||||||
},
|
|
||||||
body: 'task_id=' + taskId
|
|
||||||
})
|
|
||||||
.then(response => response.json())
|
|
||||||
.then(data => {
|
|
||||||
if (data.status === 'success') {
|
|
||||||
// 暂停成功,更新UI
|
|
||||||
pauseCrawlerBtn.disabled = true;
|
|
||||||
updateProgress(data.progress || 0, '正在暂停...');
|
|
||||||
} else {
|
|
||||||
// 显示错误信息
|
|
||||||
resultDiv.style.display = 'block';
|
|
||||||
resultDiv.className = 'crawler-result error';
|
|
||||||
resultDiv.textContent = data.message;
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.catch(error => {
|
|
||||||
// 显示错误信息
|
|
||||||
resultDiv.style.display = 'block';
|
|
||||||
resultDiv.className = 'crawler-result error';
|
|
||||||
resultDiv.textContent = '暂停请求失败: ' + error;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// 导出功能相关JavaScript
|
|
||||||
const checkboxes = document.querySelectorAll('.article-checkbox');
|
|
||||||
const exportJsonBtn = document.getElementById('exportJsonBtn');
|
|
||||||
const exportCsvBtn = document.getElementById('exportCsvBtn');
|
|
||||||
const selectAllBtn = document.getElementById('selectAllBtn');
|
|
||||||
const deselectAllBtn = document.getElementById('deselectAllBtn');
|
|
||||||
// 新增:获取ZIP导出按钮元素
|
|
||||||
const exportZipBtn = document.getElementById('exportZipBtn');
|
|
||||||
|
|
||||||
// 更新导出按钮状态
|
|
||||||
function updateExportButtons() {
|
|
||||||
const selectedCount = document.querySelectorAll('.article-checkbox:checked').length;
|
|
||||||
exportJsonBtn.disabled = selectedCount === 0;
|
|
||||||
exportCsvBtn.disabled = selectedCount === 0;
|
|
||||||
exportZipBtn.disabled = selectedCount === 0; // 新增:更新ZIP导出按钮状态
|
|
||||||
}
|
|
||||||
|
|
||||||
// 为所有复选框添加事件监听器
|
|
||||||
checkboxes.forEach(checkbox => {
|
|
||||||
checkbox.addEventListener('change', updateExportButtons);
|
|
||||||
});
|
|
||||||
|
|
||||||
// 全选功能
|
|
||||||
selectAllBtn.addEventListener('click', () => {
|
|
||||||
checkboxes.forEach(checkbox => {
|
|
||||||
checkbox.checked = true;
|
|
||||||
});
|
|
||||||
updateExportButtons();
|
|
||||||
});
|
|
||||||
|
|
||||||
// 取消全选功能
|
|
||||||
deselectAllBtn.addEventListener('click', () => {
|
|
||||||
checkboxes.forEach(checkbox => {
|
|
||||||
checkbox.checked = false;
|
|
||||||
});
|
|
||||||
updateExportButtons();
|
|
||||||
});
|
|
||||||
|
|
||||||
// Export the currently checked articles and hand the result to the browser
// as a file download.
//
// format:   value sent to the server in the "format" field of the request
//           body ('json', 'csv' or 'zip').
// filename: name the browser saves the downloaded file under.
//
// The JSON, CSV and ZIP buttons previously each carried their own copy of
// this request/download sequence; they now share this single helper.
function exportSelectedArticles(format, filename) {
    const selectedArticles = Array.from(document.querySelectorAll('.article-checkbox:checked'))
        .map(checkbox => checkbox.value);

    // POST the selected ids; the response body is saved as the download.
    fetch('{% url "export_articles" %}', {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            'X-CSRFToken': '{{ csrf_token }}'
        },
        body: JSON.stringify({
            article_ids: selectedArticles,
            format: format
        })
    })
    .then(response => {
        if (!response.ok) {
            // Carry the HTTP status so the alert says why the export failed.
            throw new Error('HTTP ' + response.status);
        }
        return response.blob();
    })
    .then(blob => {
        // Trigger the download through a temporary, programmatically clicked link.
        const url = window.URL.createObjectURL(blob);
        const a = document.createElement('a');
        a.href = url;
        a.download = filename;
        document.body.appendChild(a);
        a.click();
        document.body.removeChild(a);
        // Revoke on a later tick so the browser has started reading the blob
        // before its URL is invalidated.
        setTimeout(() => window.URL.revokeObjectURL(url), 0);
    })
    .catch(error => {
        // error.message avoids the redundant "Error: " prefix of string coercion.
        alert('导出失败: ' + error.message);
    });
}

// Export as JSON.
exportJsonBtn.addEventListener('click', () => exportSelectedArticles('json', 'articles.json'));

// Export as CSV.
exportCsvBtn.addEventListener('click', () => exportSelectedArticles('csv', 'articles.csv'));

// Export as a ZIP archive.
exportZipBtn.addEventListener('click', () => exportSelectedArticles('zip', 'articles.zip'));
|
|
||||||
|
|
||||||
// 初始化导出按钮状态
|
|
||||||
updateExportButtons();
|
|
||||||
</script>
|
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
@@ -251,11 +251,9 @@
|
|||||||
<div class="sidebar">
|
<div class="sidebar">
|
||||||
<div class="filters">
|
<div class="filters">
|
||||||
<strong>按网站筛选:</strong>
|
<strong>按网站筛选:</strong>
|
||||||
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}"
|
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" {% if not selected_website %}class="active" {% endif %}>全部</a>
|
||||||
{% if not selected_website %}class="active" {% endif %}>全部</a>
|
|
||||||
{% for website in websites %}
|
{% for website in websites %}
|
||||||
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}"
|
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" {% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
|
||||||
{% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
|
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -283,8 +281,7 @@
|
|||||||
<ul>
|
<ul>
|
||||||
{% for article in page_obj %}
|
{% for article in page_obj %}
|
||||||
<li>
|
<li>
|
||||||
<input type="checkbox" class="article-checkbox" value="{{ article.id }}"
|
<input type="checkbox" class="article-checkbox" value="{{ article.id }}" id="article_{{ article.id }}">
|
||||||
id="article_{{ article.id }}">
|
|
||||||
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
|
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
|
||||||
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
|
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
|
||||||
</li>
|
</li>
|
||||||
@@ -296,11 +293,8 @@
|
|||||||
<div class="pagination">
|
<div class="pagination">
|
||||||
{% if page_obj.has_previous %}
|
{% if page_obj.has_previous %}
|
||||||
{% if selected_website %}
|
{% if selected_website %}
|
||||||
<a href="?website=
|
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">« 首页</a>
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">«
|
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
|
||||||
首页</a>
|
|
||||||
<a href="?website=
|
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
|
|
||||||
{% else %}
|
{% else %}
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">« 首页</a>
|
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">« 首页</a>
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
|
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
|
||||||
@@ -316,15 +310,13 @@
|
|||||||
<a href="#" class="current">{{ num }}</a>
|
<a href="#" class="current">{{ num }}</a>
|
||||||
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
|
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
|
||||||
{% if selected_website %}
|
{% if selected_website %}
|
||||||
<a href="?website=
|
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
|
||||||
{% else %}
|
{% else %}
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% elif num == 1 or num == paginator.num_pages %}
|
{% elif num == 1 or num == paginator.num_pages %}
|
||||||
{% if selected_website %}
|
{% if selected_website %}
|
||||||
<a href="?website=
|
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
|
||||||
{% else %}
|
{% else %}
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
@@ -336,15 +328,11 @@
|
|||||||
|
|
||||||
{% if page_obj.has_next %}
|
{% if page_obj.has_next %}
|
||||||
{% if selected_website %}
|
{% if selected_website %}
|
||||||
<a href="?website=
|
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
|
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页 »</a>
|
||||||
<a href="?website=
|
|
||||||
{{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页
|
|
||||||
»</a>
|
|
||||||
{% else %}
|
{% else %}
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
|
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
|
||||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页
|
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页 »</a>
|
||||||
»</a>
|
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -107,6 +107,17 @@ def process_article(url, website):
|
|||||||
soup.find("div", id="content") or
|
soup.find("div", id="content") or
|
||||||
soup.find("div", class_="mainBody")
|
soup.find("div", class_="mainBody")
|
||||||
)
|
)
|
||||||
|
elif website.name == "人民日报":
|
||||||
|
# 人民日报网站的文章结构处理
|
||||||
|
title_tag = soup.find("h1") or soup.find("title")
|
||||||
|
# 查找主要内容区域
|
||||||
|
content_tag = (
|
||||||
|
soup.find("div", class_="content") or
|
||||||
|
soup.find("div", class_="article-content") or
|
||||||
|
soup.find("div", id="content") or
|
||||||
|
soup.find("div", class_="text") or
|
||||||
|
soup.find("section", class_="content")
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# 默认处理方式
|
# 默认处理方式
|
||||||
title_tag = soup.find("h1") or soup.find("title")
|
title_tag = soup.find("h1") or soup.find("title")
|
||||||
@@ -256,6 +267,24 @@ def full_site_crawler(start_url, website, max_pages=1000):
|
|||||||
("/xinwen/" in path) or
|
("/xinwen/" in path) or
|
||||||
("/huoban/" in path)
|
("/huoban/" in path)
|
||||||
)
|
)
|
||||||
|
elif website.name == "人民日报":
|
||||||
|
# 人民日报的文章页面判断逻辑
|
||||||
|
parsed_url = urlparse(url)
|
||||||
|
path = parsed_url.path
|
||||||
|
# 修改: 增加更准确的文章页面判断逻辑
|
||||||
|
is_article_page = (
|
||||||
|
(soup.find("div", class_="content") is not None and
|
||||||
|
soup.find("h1") is not None) or
|
||||||
|
soup.find("div", class_="article-content") is not None or
|
||||||
|
(soup.find("div", id="content") is not None and
|
||||||
|
soup.find("h1") is not None) or
|
||||||
|
soup.find("div", class_="text") is not None or
|
||||||
|
soup.find("section", class_="content") is not None or
|
||||||
|
("/article/" in path) or
|
||||||
|
(path.startswith("/detail/") and len(path) > 10) or
|
||||||
|
# 增加对peopleapp.com特定文章路径的判断
|
||||||
|
("/dynamic/" in path and "article" in path)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# 默认判断逻辑
|
# 默认判断逻辑
|
||||||
is_article_page = (
|
is_article_page = (
|
||||||
@@ -271,5 +300,18 @@ def full_site_crawler(start_url, website, max_pages=1000):
|
|||||||
# 扩展队列,发现新链接
|
# 扩展队列,发现新链接
|
||||||
for link in soup.find_all("a", href=True):
|
for link in soup.find_all("a", href=True):
|
||||||
href = urljoin(url, link["href"])
|
href = urljoin(url, link["href"])
|
||||||
if href not in visited and is_valid_url(href, base_netloc):
|
# 对于人民日报网站,我们扩展链接发现逻辑
|
||||||
|
if website.name == "人民日报":
|
||||||
|
# 允许爬取以https://www.peopleapp.com/开头的链接
|
||||||
|
if href.startswith("https://www.peopleapp.com/") and href not in visited:
|
||||||
|
# 增加对文章链接的识别
|
||||||
|
parsed_href = urlparse(href)
|
||||||
|
href_path = parsed_href.path
|
||||||
|
# 添加更多可能的文章链接模式
|
||||||
|
if ("/article/" in href_path or
|
||||||
|
href_path.startswith("/detail/") or
|
||||||
|
("/dynamic/" in href_path and "article" in href_path) or
|
||||||
|
href_path.count("/") > 2): # 更深层的页面可能是文章页
|
||||||
|
queue.append(href)
|
||||||
|
elif href not in visited and is_valid_url(href, base_netloc):
|
||||||
queue.append(href)
|
queue.append(href)
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ def article_list(request):
|
|||||||
|
|
||||||
# 获取筛选网站
|
# 获取筛选网站
|
||||||
selected_website = None
|
selected_website = None
|
||||||
|
# 修改:确保始终获取所有文章,除非有特定筛选
|
||||||
articles = Article.objects.all()
|
articles = Article.objects.all()
|
||||||
|
|
||||||
website_id = request.GET.get('website')
|
website_id = request.GET.get('website')
|
||||||
|
|||||||
Reference in New Issue
Block a user