Add support for dongfangyancao

2025-08-11 22:20:19 +08:00
parent 6d80326a4e
commit d9d2ea9d99
11 changed files with 686 additions and 58 deletions

View File

@@ -1,11 +1,234 @@
from django.contrib import admin
from django.contrib.admin import AdminSite
from .models import Website, Article
# Imports for admin actions
from django.contrib import messages
from django.http import HttpResponseRedirect
# Imports for the export actions
import csv
from django.http import HttpResponse
import json


# Custom admin sites, one per source website
class NewsCnAdminSite(AdminSite):
    site_header = "新华网管理后台"
    site_title = "新华网管理"
    index_title = "新华网内容管理"


class DongfangyancaoAdminSite(AdminSite):
    site_header = "东方烟草报管理后台"
    site_title = "东方烟草报管理"
    index_title = "东方烟草报内容管理"


# Instantiate the admin sites
news_cn_admin = NewsCnAdminSite(name='news_cn_admin')
dongfangyancao_admin = DongfangyancaoAdminSite(name='dongfangyancao_admin')
@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
    list_display = ('name', 'base_url', 'enabled')


# Custom actions for ArticleAdmin
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'website', 'pub_date')
    search_fields = ('title', 'content')
    # Custom actions (Django's built-in delete_selected remains available by default)
    actions = ['delete_dongfangyancao_articles', 'export_as_csv', 'export_as_json']
    def delete_dongfangyancao_articles(self, request, queryset):
        """Bulk-delete every article that belongs to 东方烟草报."""
        try:
            dongfangyancao_website = Website.objects.get(name='东方烟草报')
            # Delete all articles of that website, regardless of the current selection
            deleted_count = Article.objects.filter(website=dongfangyancao_website).delete()[0]
            self.message_user(request, f"成功删除 {deleted_count} 篇东方烟草报文章", messages.SUCCESS)
        except Website.DoesNotExist:
            self.message_user(request, "未找到东方烟草报网站配置", messages.ERROR)
    # Display name of the action in the admin UI
    delete_dongfangyancao_articles.short_description = "删除所有东方烟草报文章"

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV."""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields]
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta)
        writer = csv.writer(response)
        writer.writerow(field_names)
        for obj in queryset:
            row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field) for field in field_names]
            writer.writerow(row)
        return response
    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON."""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=articles.json'
        # Build the export payload
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })
        # Write the JSON body
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response
    export_as_json.short_description = "导出选中文章为JSON格式"

# Dedicated article admin classes, one per source website
class NewsCnArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'pub_date')
    search_fields = ('title', 'content')
    list_filter = ('pub_date',)
    actions = ['export_as_csv', 'export_as_json']

    def get_queryset(self, request):
        qs = super().get_queryset(request)
        # Only show articles from www.news.cn
        return qs.filter(website__name='www.news.cn')

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV."""
        meta = self.model._meta
        # Exclude the content field to keep the CSV small
        field_names = [field.name for field in meta.fields if field.name != 'content']
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv'
        writer = csv.writer(response)
        writer.writerow(field_names)
        for obj in queryset:
            row = []
            for field in field_names:
                value = getattr(obj, field)
                if callable(value):
                    value = value()
                if field == 'website':
                    value = value.name
                row.append(value)
            writer.writerow(row)
        return response
    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON."""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json'
        # Build the export payload
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })
        # Write the JSON body
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response
    export_as_json.short_description = "导出选中文章为JSON格式"

class DongfangyancaoArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'pub_date')
    search_fields = ('title', 'content')
    list_filter = ('pub_date',)
    # Custom actions (the built-in delete_selected is still available by default)
    actions = ['delete_all_articles', 'export_as_csv', 'export_as_json']

    def get_queryset(self, request):
        qs = super().get_queryset(request)
        # Only show articles from 东方烟草报
        return qs.filter(website__name='东方烟草报')

    def delete_all_articles(self, request, queryset):
        """Delete everything in the current queryset (all 东方烟草报 articles)."""
        deleted_count = self.get_queryset(request).delete()[0]
        self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS)
    # Display name of the action in the admin UI
    delete_all_articles.short_description = "删除所有当前筛选的文章"

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV."""
        meta = self.model._meta
        # Exclude the content field to keep the CSV small
        field_names = [field.name for field in meta.fields if field.name != 'content']
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.csv'
        writer = csv.writer(response)
        writer.writerow(field_names)
        for obj in queryset:
            row = []
            for field in field_names:
                value = getattr(obj, field)
                if callable(value):
                    value = value()
                if field == 'website':
                    value = value.name
                row.append(value)
            writer.writerow(row)
        return response
    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON."""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json'
        # Build the export payload
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })
        # Write the JSON body
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response
    export_as_json.short_description = "导出选中文章为JSON格式"

# Register the models on their respective admin sites
news_cn_admin.register(Website, WebsiteAdmin)
news_cn_admin.register(Article, NewsCnArticleAdmin)
dongfangyancao_admin.register(Website, WebsiteAdmin)
dongfangyancao_admin.register(Article, DongfangyancaoArticleAdmin)
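
Note: the three ModelAdmin classes above duplicate the same export logic. A possible follow-up refactor, not part of this commit, is to move the export actions into a shared mixin. A minimal sketch (the mixin name and export_filename attribute are invented for illustration):

import csv
from django.http import HttpResponse


class ExportActionsMixin:
    """Hypothetical mixin sharing the CSV export used by the ModelAdmin classes above."""
    export_filename = 'articles'  # assumed attribute, overridden per admin class
    actions = ['export_as_csv']

    def export_as_csv(self, request, queryset):
        meta = self.model._meta
        field_names = [f.name for f in meta.fields if f.name != 'content']
        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = f'attachment; filename={self.export_filename}.csv'
        writer = csv.writer(response)
        writer.writerow(field_names)
        for obj in queryset:
            # Foreign keys fall back to their str() representation in this sketch
            writer.writerow([getattr(obj, field) for field in field_names])
        return response
    export_as_csv.short_description = "导出选中文章为CSV格式"


# Usage sketch: class NewsCnArticleAdmin(ExportActionsMixin, admin.ModelAdmin): ...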

View File

@@ -1,20 +1,21 @@
# core/management/commands/crawl_dongfangyancao.py
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler


class Command(BaseCommand):
    help = "全站递归爬取 东方烟草报"

    def handle(self, *args, **kwargs):
        website, created = Website.objects.get_or_create(
            name="东方烟草报",
            defaults={
                'article_list_url': 'https://www.eastobacco.com/',
                'article_selector': 'a'
            }
        )
        start_url = "https://www.eastobacco.com/"
        self.stdout.write(f"开始全站爬取: {start_url}")
        full_site_crawler(start_url, website, max_pages=500)
        self.stdout.write("爬取完成")
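
Given the file path in the comment above, the new crawler command can be run from the shell or programmatically; a minimal sketch:

from django.core.management import call_command

# Equivalent to: python manage.py crawl_dongfangyancao
call_command('crawl_dongfangyancao')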

View File

@@ -1,18 +1,21 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler


class Command(BaseCommand):
    help = "全站递归爬取 www.news.cn"

    def handle(self, *args, **kwargs):
        website, created = Website.objects.get_or_create(
            name="www.news.cn",
            defaults={
                'article_list_url': 'https://www.news.cn/',
                'article_selector': 'a'
            }
        )
        start_url = "https://www.news.cn/"
        self.stdout.write(f"开始全站爬取: {start_url}")
        full_site_crawler(start_url, website, max_pages=500)
        self.stdout.write("爬取完成")

View File

@@ -0,0 +1,21 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import crawl_xinhua_list


class Command(BaseCommand):
    help = '批量爬取新华网文章'

    def handle(self, *args, **options):
        # Marker to confirm whether this command is actually being invoked
        self.stdout.write(self.style.WARNING("crawl_xinhua command is being used"))
        list_url = "https://www.news.cn/legal/index.html"
        try:
            website = Website.objects.get(base_url="https://www.news.cn/")
        except Website.DoesNotExist:
            self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
            return
        self.stdout.write(f"开始爬取文章列表页: {list_url}")
        crawl_xinhua_list(list_url, website)
        self.stdout.write(self.style.SUCCESS("批量爬取完成"))

View File

@@ -0,0 +1,130 @@
from django.core.management.base import BaseCommand
from core.models import Article, Website
import json
import csv
import os
from django.conf import settings
from django.core.files.storage import default_storage
import zipfile
from django.utils import timezone


class Command(BaseCommand):
    help = '导出文章及相关的媒体文件(图片、视频等)'

    def add_arguments(self, parser):
        parser.add_argument('--format', type=str, default='json', help='导出格式: json 或 csv')
        parser.add_argument('--website', type=str, help='指定网站名称导出特定网站的文章')
        parser.add_argument('--output', type=str, default='', help='输出文件路径')
        parser.add_argument('--include-media', action='store_true', help='包含媒体文件')

    def handle(self, *args, **options):
        format_type = options['format'].lower()
        website_name = options['website']
        output_path = options['output']
        include_media = options['include_media']

        # Build the article queryset
        articles = Article.objects.all()
        if website_name:
            try:
                website = Website.objects.get(name=website_name)
                articles = articles.filter(website=website)
            except Website.DoesNotExist:
                self.stdout.write(self.style.ERROR(f'网站 "{website_name}" 不存在'))
                return
        if not articles.exists():
            self.stdout.write(self.style.WARNING('没有找到文章'))
            return

        # Prepare the export data
        articles_data = []
        media_files = []
        for article in articles:
            article_data = {
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.isoformat() if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.isoformat(),
                'media_files': article.media_files
            }
            articles_data.append(article_data)
            # Collect the media file paths
            if include_media:
                for media_path in article.media_files:
                    full_path = os.path.join(settings.MEDIA_ROOT, media_path)
                    if os.path.exists(full_path):
                        media_files.append(full_path)

        # Determine the output path
        if not output_path:
            timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
            if include_media:
                output_path = f'articles_export_{timestamp}.zip'
            else:
                output_path = f'articles_export_{timestamp}.{format_type}'

        # Run the export
        if include_media:
            self.export_with_media(articles_data, media_files, output_path, format_type)
        else:
            if format_type == 'json':
                self.export_as_json(articles_data, output_path)
            elif format_type == 'csv':
                self.export_as_csv(articles_data, output_path)
            else:
                self.stdout.write(self.style.ERROR('不支持的格式,仅支持 json 或 csv'))
                return
        self.stdout.write(self.style.SUCCESS(f'成功导出 {len(articles_data)} 篇文章到 {output_path}'))

    def export_as_json(self, articles_data, output_path):
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(articles_data, f, ensure_ascii=False, indent=2)

    def export_as_csv(self, articles_data, output_path):
        if not articles_data:
            return
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for article_data in articles_data:
                # Flatten the list so it fits in a single CSV cell
                article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
                writer.writerow(article_data)

    def export_with_media(self, articles_data, media_files, output_path, format_type):
        # Create the ZIP archive
        with zipfile.ZipFile(output_path, 'w') as zipf:
            # Add the article data file
            data_filename = f'articles.{format_type}'
            if format_type == 'json':
                json_data = json.dumps(articles_data, ensure_ascii=False, indent=2)
                zipf.writestr(data_filename, json_data)
            elif format_type == 'csv':
                # Build the CSV content in memory
                if articles_data:
                    import io
                    csv_buffer = io.StringIO()
                    fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
                    writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
                    writer.writeheader()
                    for article_data in articles_data:
                        article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
                        writer.writerow(article_data)
                    zipf.writestr(data_filename, csv_buffer.getvalue())
            # Add the media files
            for media_path in media_files:
                arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
                zipf.write(media_path, arcname)
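
A typical invocation of this command, assuming the file above is saved as core/management/commands/export_articles.py (the file name is not shown in this view), might look like:

from django.core.management import call_command

# Export all 东方烟草报 articles plus their media files into a ZIP archive
call_command('export_articles', format='json', website='东方烟草报', include_media=True)
# Rough shell equivalent:
#   python manage.py export_articles --format json --website 东方烟草报 --include-media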

View File

@@ -1,5 +1,6 @@
from django.db import models


class Website(models.Model):
    name = models.CharField(max_length=100, unique=True)
    base_url = models.URLField()
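
The Article model itself is not shown in this hunk. Based on the fields referenced elsewhere in the commit (title, website, url, pub_date, content, created_at, and media_files used as a list), a sketch of what it presumably looks like is:

class Article(models.Model):  # assumed shape, not shown in this diff
    website = models.ForeignKey(Website, on_delete=models.CASCADE)
    title = models.CharField(max_length=500)
    url = models.URLField(unique=True)
    content = models.TextField()
    pub_date = models.DateTimeField(null=True, blank=True)
    created_at = models.DateTimeField(auto_now_add=True)
    media_files = models.JSONField(default=list)  # list of relative media paths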

View File

@@ -3,15 +3,75 @@
<head>
    <meta charset="UTF-8"/>
    <title>{{ article.title }}</title>
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f8f9fa;
        }
        .article-container {
            background: white;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
            padding: 30px;
            margin-bottom: 20px;
        }
        h1 {
            color: #2c3e50;
            border-bottom: 2px solid #3498db;
            padding-bottom: 10px;
            margin-top: 0;
        }
        .meta {
            color: #7f8c8d;
            font-size: 0.9em;
            margin-bottom: 20px;
        }
        hr {
            border: 0;
            height: 1px;
            background: #ecf0f1;
            margin: 20px 0;
        }
        .content {
            font-size: 16px;
        }
        .content img {
            max-width: 100%;
            height: auto;
            border-radius: 4px;
            margin: 10px 0;
        }
        .back-link {
            display: inline-block;
            padding: 10px 20px;
            background-color: #3498db;
            color: white;
            text-decoration: none;
            border-radius: 4px;
            transition: background-color 0.3s;
        }
        .back-link:hover {
            background-color: #2980b9;
        }
    </style>
</head>
<body>
    <div class="article-container">
        <h1>{{ article.title }}</h1>
        <div class="meta">
            <p>发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}</p>
        </div>
        <hr/>
        <div class="content">
            {{ article.content|safe }}
        </div>
        <hr/>
        <p><a href="{% url 'article_list' %}" class="back-link">返回列表</a></p>
    </div>
</body>
</html>

View File

@@ -3,15 +3,111 @@
<head>
    <meta charset="UTF-8"/>
    <title>绿色课堂文章列表</title>
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f8f9fa;
        }
        .container {
            background: white;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
            padding: 30px;
            margin-bottom: 20px;
        }
        h1 {
            color: #2c3e50;
            border-bottom: 2px solid #3498db;
            padding-bottom: 10px;
            margin-top: 0;
        }
        .filters {
            margin-bottom: 20px;
            padding: 15px;
            background-color: #f1f8ff;
            border-radius: 5px;
        }
        .filters a {
            display: inline-block;
            padding: 5px 10px;
            margin: 0 5px 5px 0;
            background-color: #e1e8ed;
            color: #333;
            text-decoration: none;
            border-radius: 3px;
        }
        .filters a.active {
            background-color: #3498db;
            color: white;
        }
        ul {
            list-style: none;
            padding: 0;
        }
        li {
            padding: 10px 0;
            border-bottom: 1px solid #ecf0f1;
        }
        li:last-child {
            border-bottom: none;
        }
        a {
            color: #3498db;
            text-decoration: none;
        }
        a:hover {
            color: #2980b9;
            text-decoration: underline;
        }
        .meta {
            color: #7f8c8d;
            font-size: 0.9em;
        }
        .pagination {
            margin-top: 30px;
            text-align: center;
            padding: 20px 0;
        }
        .pagination a {
            display: inline-block;
            padding: 8px 16px;
            background-color: #3498db;
            color: white;
            text-decoration: none;
            border-radius: 4px;
            margin: 0 5px;
        }
        .pagination a:hover {
            background-color: #2980b9;
        }
        .pagination span {
            margin: 0 10px;
            color: #7f8c8d;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>绿色课堂文章列表</h1>
        <div class="filters">
            <strong>按网站筛选:</strong>
            <a href="{% url 'article_list' %}" {% if not selected_website %}class="active"{% endif %}>全部</a>
            {% for website in websites %}
                <a href="?website={{ website.id }}" {% if selected_website.id == website.id %}class="active"{% endif %}>{{ website.name }}</a>
            {% endfor %}
        </div>
        <ul>
            {% for article in page_obj %}
                <li>
                    <a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
                    <div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
                </li>
            {% empty %}
                <li>暂无文章</li>
@@ -20,14 +116,23 @@
        <div class="pagination">
            {% if page_obj.has_previous %}
                {% if selected_website %}
                    <a href="?website={{ selected_website.id }}&page={{ page_obj.previous_page_number }}">上一页</a>
                {% else %}
                    <a href="?page={{ page_obj.previous_page_number }}">上一页</a>
                {% endif %}
            {% endif %}
            <span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
            {% if page_obj.has_next %}
                {% if selected_website %}
                    <a href="?website={{ selected_website.id }}&page={{ page_obj.next_page_number }}">下一页</a>
                {% else %}
                    <a href="?page={{ page_obj.next_page_number }}">下一页</a>
                {% endif %}
            {% endif %}
        </div>
    </div>
</body>
</html>

View File

@@ -7,16 +7,44 @@ from collections import deque
from django.utils import timezone
from django.conf import settings
from core.models import Article
import re


def download_media(url, save_dir):
    try:
        # Send browser-like headers to avoid 403 Forbidden responses
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": urljoin(url, "/")
        }
        resp = requests.get(url, timeout=15, headers=headers)
        resp.raise_for_status()
    except Exception as e:
        print(f"下载失败:{url},错误:{e}")
        return None

    # Derive the filename more safely: drop query parameters and sanitise special characters
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    if not filename or '.' not in filename:
        # Fall back to a default name when the URL path has no usable filename
        filename = 'media_file'
    # Strip characters that are unsafe in filenames
    filename = re.sub(r'[^\w\-_\.]', '_', filename)
    # Make sure the file has an extension
    if '.' not in filename:
        content_type = resp.headers.get('content-type', '')
        if 'image/jpeg' in content_type:
            filename += '.jpg'
        elif 'image/png' in content_type:
            filename += '.png'
        elif 'image/gif' in content_type:
            filename += '.gif'
        else:
            filename += '.bin'  # default binary extension

    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)
@@ -31,6 +59,7 @@ def download_media(url, save_dir):
        f.write(resp.content)
    return filepath


def process_article(url, website):
    if Article.objects.filter(url=url).exists():
        print(f"文章已存在,跳过: {url}")
@@ -41,10 +70,36 @@ def process_article(url, website):
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")

    # Handle the article structure of each supported website
    if website.name == "www.news.cn":
        title_tag = soup.find("span", class_="title")
        content_tag = soup.find("span", id="detailContent")
    elif website.name == "东方烟草报":
        # Try several title selectors for 东方烟草报, in order of priority
        title_tag = (
            soup.find("h1", id="title") or       # h1 tags carrying id="title"
            soup.find("h1") or                   # main heading
            soup.find("title") or                # page <title>
            soup.find("div", class_="title") or  # some pages use div.title
            soup.find("h2")                      # fallback heading
        )
        content_tag = soup.find("div", class_="content")  # article body is usually in div.content
        # Support an alternative content structure
        if not content_tag:
            content_tag = soup.find("div", id="gallery")
        # And one more content structure
        if not content_tag:
            content_tag = soup.find("div", id="ContentText")
    else:
        # Default handling for other websites
        title_tag = soup.find("h1") or soup.find("title")
        content_tag = soup.find("div", class_="content") or soup.find("div", id="content")

    title = title_tag.get_text(strip=True) if title_tag else "无标题"
    # Trim any leftover whitespace around the title
    title = title.strip() if title else "无标题"

    if not content_tag:
        print("没有找到正文,跳过:", url)
        return
@@ -80,6 +135,7 @@ def process_article(url, website):
    )
    print(f"已保存文章及图片:{title}")


def is_valid_url(url, base_netloc):
    try:
        parsed = urlparse(url)
@@ -91,6 +147,7 @@ def is_valid_url(url, base_netloc):
    except Exception:
        return False


def full_site_crawler(start_url, website, max_pages=1000):
    headers = {"User-Agent": "Mozilla/5.0"}
    visited = set()
@@ -117,8 +174,30 @@ def full_site_crawler(start_url, website, max_pages=1000):
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")

        # Decide per website whether this page is an article page
        is_article_page = False
        if website.name == "www.news.cn":
            is_article_page = soup.find("span", id="detailContent") is not None
        elif website.name == "东方烟草报":
            # For 东方烟草报 also use the URL pattern as a hint:
            # article URLs usually contain /content/ followed by a date-like path
            parsed_url = urlparse(url)
            path = parsed_url.path
            is_article_page = (
                soup.find("div", class_="content") is not None or
                soup.find("div", id="gallery") is not None or
                soup.find("div", id="ContentText") is not None or
                ("/content/" in path and len(path) > 20)
            )
        else:
            # Default detection logic
            is_article_page = (
                soup.find("div", class_="content") is not None or
                soup.find("div", id="content") is not None
            )

        # If it is an article page, hand it off to the article processor
        if is_article_page:
            process_article(url, website)
            pages_crawled += 1
View File

@@ -2,6 +2,7 @@ from django.shortcuts import render, get_object_or_404
from django.core.paginator import Paginator
from .models import Article


def article_list(request):
    """
    View function that renders the article list
@@ -16,6 +17,7 @@ def article_list(request):
        'page_obj': page_obj
    })


def article_detail(request, article_id):
    """
    View function that renders the article detail page
@@ -24,5 +26,3 @@ def article_detail(request, article_id):
    return render(request, 'core/article_detail.html', {
        'article': article
    })
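
The new article_list.html filters by website via ?website=<id> and expects websites and selected_website in the template context, but the corresponding view changes are not visible in this hunk. A minimal sketch of an article_list view compatible with that template (assuming it is not already provided elsewhere; the page size of 20 is an assumption) could be:

from django.core.paginator import Paginator
from django.shortcuts import render

from .models import Article, Website


def article_list(request):
    """Hypothetical list view with the website filter the new template expects."""
    websites = Website.objects.all()
    selected_website = None
    articles = Article.objects.order_by('-created_at')
    website_id = request.GET.get('website')
    if website_id:
        selected_website = Website.objects.filter(id=website_id).first()
        if selected_website:
            articles = articles.filter(website=selected_website)
    page_obj = Paginator(articles, 20).get_page(request.GET.get('page'))
    return render(request, 'core/article_list.html', {
        'page_obj': page_obj,
        'websites': websites,
        'selected_website': selected_website,
    })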

View File

@@ -1,10 +1,15 @@
from django.conf import settings
from django.conf.urls.static import static
from django.contrib import admin
from django.urls import path, include
# Import the custom admin site instances
from core.admin import news_cn_admin, dongfangyancao_admin

urlpatterns = [
    path('admin/', admin.site.urls),
    path('news_cn_admin/', news_cn_admin.urls),
    path('dongfangyancao_admin/', dongfangyancao_admin.urls),
    # Front-end routes are handled by the core app's urls
    path('', include('core.urls')),
]