Compare commits


5 Commits

SHA1 Message Date
958b087f54 Add Search button 2025-08-11 23:42:14 +08:00
b6bbb90703 Support export for Word 2025-08-11 23:14:56 +08:00
bfd1604872 Add packages 2025-08-11 22:55:57 +08:00
d9d2ea9d99 Add Support dongfangyaocao 2025-08-11 22:20:19 +08:00
6d80326a4e Add Support full site 2025-08-11 14:33:32 +08:00
12 changed files with 1220 additions and 83 deletions

View File

@@ -1,11 +1,349 @@
from django.contrib import admin
from django.contrib.admin import AdminSite
from .models import Website, Article
# Imports for admin actions
from django.contrib import messages
from django.http import HttpResponseRedirect
# Imports for the export features
import csv
from django.http import HttpResponse
import json

# Custom admin sites
class NewsCnAdminSite(AdminSite):
    site_header = "新华网管理后台"
    site_title = "新华网管理"
    index_title = "新华网内容管理"

class DongfangyancaoAdminSite(AdminSite):
    site_header = "东方烟草报管理后台"
    site_title = "东方烟草报管理"
    index_title = "东方烟草报内容管理"

# Instantiate the admin sites
news_cn_admin = NewsCnAdminSite(name='news_cn_admin')
dongfangyancao_admin = DongfangyancaoAdminSite(name='dongfangyancao_admin')
@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
    list_display = ('name', 'base_url', 'enabled')

# Custom actions for ArticleAdmin
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'website', 'pub_date')
    search_fields = ('title', 'content')
    # Available admin actions
    actions = ['delete_selected_articles', 'delete_dongfangyancao_articles', 'export_as_csv', 'export_as_json',
               'export_as_word']
    def delete_dongfangyancao_articles(self, request, queryset):
        """Delete every 东方烟草报 article in one step"""
        # Look up the 东方烟草报 website object
        try:
            dongfangyancao_website = Website.objects.get(name='东方烟草报')
            # Delete all of its articles
            deleted_count = Article.objects.filter(website=dongfangyancao_website).delete()[0]
            self.message_user(request, f"成功删除 {deleted_count} 篇东方烟草报文章", messages.SUCCESS)
        except Website.DoesNotExist:
            self.message_user(request, "未找到东方烟草报网站配置", messages.ERROR)

    # Display name for the action
    delete_dongfangyancao_articles.short_description = "删除所有东方烟草报文章"
    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV"""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields]

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta)
        writer = csv.writer(response)

        writer.writerow(field_names)
        for obj in queryset:
            row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field) for field in
                   field_names]
            writer.writerow(row)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"
    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON"""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=articles.json'

        # Build the data to export
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })

        # Write out the JSON payload
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"
    def export_as_word(self, request, queryset):
        """Export the selected articles as a Word document"""
        try:
            from docx import Document
            from io import BytesIO
            from docx.shared import Inches
        except ImportError:
            self.message_user(request, "缺少python-docx库请安装: pip install python-docx", messages.ERROR)
            return

        # Create the Word document
        doc = Document()
        doc.add_heading('文章导出', 0)

        for article in queryset:
            # Article title
            doc.add_heading(article.title, level=1)

            # Article metadata
            doc.add_paragraph(f"网站: {article.website.name}")
            doc.add_paragraph(f"URL: {article.url}")
            doc.add_paragraph(
                f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
            doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")

            # Article body
            doc.add_heading('内容', level=2)
            # Light HTML handling: strip tags and deal with images
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(article.content, 'html.parser')

            # Embed images found in the content
            for img in soup.find_all('img'):
                src = img.get('src', '')
                if src:
                    # Try to add the image to the document
                    try:
                        import os
                        from django.conf import settings
                        import requests
                        from io import BytesIO

                        # Build the full image path
                        if src.startswith('http'):
                            # Remote image
                            response = requests.get(src, timeout=10)
                            image_stream = BytesIO(response.content)
                            doc.add_picture(image_stream, width=Inches(4.0))
                        else:
                            # Local image
                            full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                            if os.path.exists(full_path):
                                doc.add_picture(full_path, width=Inches(4.0))
                    except Exception as e:
                        # If embedding fails, keep the image URL as text
                        doc.add_paragraph(f"[图片: {src}]")
                # Drop the original img tag
                img.decompose()

            content_text = soup.get_text()
            doc.add_paragraph(content_text)

            # Media file information
            if article.media_files:
                doc.add_heading('媒体文件', level=2)
                for media_file in article.media_files:
                    try:
                        import os
                        from django.conf import settings
                        from io import BytesIO
                        import requests

                        full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                        if os.path.exists(full_path):
                            # Embed the picture
                            doc.add_picture(full_path, width=Inches(4.0))
                        else:
                            # Media file given as a URL
                            if media_file.startswith('http'):
                                response = requests.get(media_file, timeout=10)
                                image_stream = BytesIO(response.content)
                                doc.add_picture(image_stream, width=Inches(4.0))
                            else:
                                doc.add_paragraph(media_file)
                    except Exception as e:
                        doc.add_paragraph(media_file)

            # Page break between articles
            doc.add_page_break()

        # Save to an in-memory buffer
        buffer = BytesIO()
        doc.save(buffer)
        buffer.seek(0)

        # Build the HttpResponse
        from django.http import HttpResponse
        response = HttpResponse(buffer.getvalue(),
                                content_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
        response['Content-Disposition'] = 'attachment; filename=articles.docx'
        return response

    export_as_word.short_description = "导出选中文章为Word格式"
# Dedicated article admin classes for each website
class NewsCnArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'pub_date')
    search_fields = ('title', 'content')
    list_filter = ('pub_date',)
    actions = ['export_as_csv', 'export_as_json']

    def get_queryset(self, request):
        qs = super().get_queryset(request)
        # Only show articles from www.news.cn
        return qs.filter(website__name='www.news.cn')

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV"""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields if field.name != 'content']  # skip content to keep the CSV small

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv'
        writer = csv.writer(response)

        writer.writerow(field_names)
        for obj in queryset:
            row = []
            for field in field_names:
                value = getattr(obj, field)
                if callable(value):
                    value = value()
                if field == 'website':
                    value = value.name
                row.append(value)
            writer.writerow(row)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON"""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json'

        # Build the data to export
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })

        # Write out the JSON payload
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"
class DongfangyancaoArticleAdmin(admin.ModelAdmin):
    list_display = ('title', 'pub_date')
    search_fields = ('title', 'content')
    list_filter = ('pub_date',)
    # Available admin actions
    actions = ['delete_selected_articles', 'delete_all_articles', 'export_as_csv', 'export_as_json']

    def get_queryset(self, request):
        qs = super().get_queryset(request)
        # Only show articles from 东方烟草报
        return qs.filter(website__name='东方烟草报')

    def delete_all_articles(self, request, queryset):
        """Delete every article in the current filter (all 东方烟草报 articles)"""
        deleted_count = self.get_queryset(request).delete()[0]
        self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS)

    # Display name for the action
    delete_all_articles.short_description = "删除所有当前筛选的文章"

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV"""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields if field.name != 'content']  # skip content to keep the CSV small

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.csv'
        writer = csv.writer(response)

        writer.writerow(field_names)
        for obj in queryset:
            row = []
            for field in field_names:
                value = getattr(obj, field)
                if callable(value):
                    value = value()
                if field == 'website':
                    value = value.name
                row.append(value)
            writer.writerow(row)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as JSON"""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json'

        # Build the data to export
        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })

        # Write out the JSON payload
        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"

# Register the models on their respective admin sites
news_cn_admin.register(Website, WebsiteAdmin)
news_cn_admin.register(Article, NewsCnArticleAdmin)
dongfangyancao_admin.register(Website, WebsiteAdmin)
dongfangyancao_admin.register(Article, DongfangyancaoArticleAdmin)

View File

@@ -0,0 +1,20 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler

class Command(BaseCommand):
    help = "全站递归爬取 东方烟草报"

    def handle(self, *args, **kwargs):
        website, created = Website.objects.get_or_create(
            name="东方烟草报",
            defaults={
                'article_list_url': 'https://www.eastobacco.com/',
                'article_selector': 'a'
            }
        )

        start_url = "https://www.eastobacco.com/"
        self.stdout.write(f"开始全站爬取: {start_url}")
        full_site_crawler(start_url, website, max_pages=500)
        self.stdout.write("爬取完成")

View File

@@ -1,18 +1,20 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler

class Command(BaseCommand):
    help = "全站递归爬取 www.news.cn"

    def handle(self, *args, **kwargs):
        website, created = Website.objects.get_or_create(
            name="www.news.cn",
            defaults={
                'article_list_url': 'https://www.news.cn/',
                'article_selector': 'a'
            }
        )

        start_url = "https://www.news.cn/"
        self.stdout.write(f"开始全站爬取: {start_url}")
        full_site_crawler(start_url, website, max_pages=500)
        self.stdout.write("爬取完成")

View File

@@ -0,0 +1,21 @@
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import crawl_xinhua_list

class Command(BaseCommand):
    help = '批量爬取新华网文章'

    def handle(self, *args, **options):
        # Usage marker, to confirm whether this command is still being invoked
        self.stdout.write(self.style.WARNING("crawl_xinhua command is being used"))
        list_url = "https://www.news.cn/legal/index.html"

        try:
            website = Website.objects.get(base_url="https://www.news.cn/")
        except Website.DoesNotExist:
            self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
            return

        self.stdout.write(f"开始爬取文章列表页: {list_url}")
        crawl_xinhua_list(list_url, website)
        self.stdout.write(self.style.SUCCESS("批量爬取完成"))
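
The diff does not show the filenames of these management command modules, so the command names below are assumptions. A minimal sketch of driving the new crawlers programmatically (equivalent to `python manage.py <command>`) might look like this:

# Hypothetical command names: the actual module names are not visible in this diff.
from django.core.management import call_command

# Recursive full-site crawl for each configured source site
call_command("crawl_news_cn")          # assumed name of the www.news.cn command
call_command("crawl_dongfangyancao")   # assumed name of the 东方烟草报 command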

View File

@@ -0,0 +1,311 @@
from django.core.management.base import BaseCommand
from core.models import Article, Website
import json
import csv
import os
from django.conf import settings
from django.core.files.storage import default_storage
import zipfile
from django.utils import timezone

class Command(BaseCommand):
    help = '导出文章及相关的媒体文件(图片、视频等)'

    def add_arguments(self, parser):
        parser.add_argument('--format', type=str, default='json', help='导出格式: json 或 csv')
        parser.add_argument('--website', type=str, help='指定网站名称导出特定网站的文章')
        parser.add_argument('--output', type=str, default='', help='输出文件路径')
        parser.add_argument('--include-media', action='store_true', help='包含媒体文件')

    def handle(self, *args, **options):
        format_type = options['format'].lower()
        website_name = options['website']
        output_path = options['output']
        include_media = options['include_media']

        # Build the article queryset
        articles = Article.objects.all()
        if website_name:
            try:
                website = Website.objects.get(name=website_name)
                articles = articles.filter(website=website)
            except Website.DoesNotExist:
                self.stdout.write(self.style.ERROR(f'网站 "{website_name}" 不存在'))
                return

        if not articles.exists():
            self.stdout.write(self.style.WARNING('没有找到文章'))
            return

        # Prepare the export payload
        articles_data = []
        media_files = []

        for article in articles:
            article_data = {
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.isoformat() if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.isoformat(),
                'media_files': article.media_files
            }
            articles_data.append(article_data)

            # Collect media file paths
            if include_media:
                for media_path in article.media_files:
                    full_path = os.path.join(settings.MEDIA_ROOT, media_path)
                    if os.path.exists(full_path):
                        media_files.append(full_path)

        # Work out the output path
        if not output_path:
            timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
            if include_media:
                output_path = f'articles_export_{timestamp}.zip'
            else:
                output_path = f'articles_export_{timestamp}.{format_type}'

        # Run the export
        if include_media:
            self.export_with_media(articles_data, media_files, output_path, format_type)
        else:
            if format_type == 'json':
                self.export_as_json(articles_data, output_path)
            elif format_type == 'csv':
                self.export_as_csv(articles_data, output_path)
            # Word export support
            elif format_type == 'docx':
                self.export_as_word(articles_data, output_path)
            else:
                self.stdout.write(self.style.ERROR('不支持的格式,仅支持 json、csv 或 docx'))
                return

        self.stdout.write(self.style.SUCCESS(f'成功导出 {len(articles_data)} 篇文章到 {output_path}'))

    def export_as_json(self, articles_data, output_path):
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(articles_data, f, ensure_ascii=False, indent=2)

    def export_as_csv(self, articles_data, output_path):
        if not articles_data:
            return

        # Open the CSV file
        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            for article_data in articles_data:
                # Join the list into a string so it can be stored in the CSV
                article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
                    'media_files'] else ''
                writer.writerow(article_data)

    # Word export
    def export_as_word(self, articles_data, output_path):
        try:
            from docx import Document
            from docx.shared import Inches
        except ImportError:
            self.stdout.write(self.style.ERROR('缺少python-docx库请安装: pip install python-docx'))
            return

        # Create the Word document
        doc = Document()
        doc.add_heading('文章导出', 0)

        for article_data in articles_data:
            # Article title
            doc.add_heading(article_data['title'], level=1)

            # Article metadata
            doc.add_paragraph(f"网站: {article_data['website']}")
            doc.add_paragraph(f"URL: {article_data['url']}")
            doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
            doc.add_paragraph(f"创建时间: {article_data['created_at']}")

            # Article body
            doc.add_heading('内容', level=2)
            # Light HTML handling: strip tags
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(article_data['content'], 'html.parser')

            # Embed images found in the content
            for img in soup.find_all('img'):
                src = img.get('src', '')
                if src:
                    # Try to add the image to the document
                    try:
                        import os
                        from django.conf import settings
                        import requests
                        from io import BytesIO

                        # Build the full image path
                        if src.startswith('http'):
                            # Remote image
                            response = requests.get(src, timeout=10)
                            image_stream = BytesIO(response.content)
                            doc.add_picture(image_stream, width=Inches(4.0))
                        else:
                            # Local image
                            full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                            if os.path.exists(full_path):
                                doc.add_picture(full_path, width=Inches(4.0))
                    except Exception as e:
                        # If embedding fails, keep the image URL as text
                        doc.add_paragraph(f"[图片: {src}]")
                # Drop the original img tag
                img.decompose()

            content_text = soup.get_text()
            doc.add_paragraph(content_text)

            # Media file information
            if article_data['media_files']:
                doc.add_heading('媒体文件', level=2)
                for media_file in article_data['media_files']:
                    try:
                        import os
                        from django.conf import settings
                        from io import BytesIO
                        import requests

                        full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                        if os.path.exists(full_path):
                            # Embed the picture
                            doc.add_picture(full_path, width=Inches(4.0))
                        else:
                            # Media file given as a URL
                            if media_file.startswith('http'):
                                response = requests.get(media_file, timeout=10)
                                image_stream = BytesIO(response.content)
                                doc.add_picture(image_stream, width=Inches(4.0))
                            else:
                                doc.add_paragraph(media_file)
                    except Exception as e:
                        doc.add_paragraph(media_file)

            # Page break between articles
            doc.add_page_break()

        # Save the document
        doc.save(output_path)

    def export_with_media(self, articles_data, media_files, output_path, format_type):
        # Create the ZIP archive
        with zipfile.ZipFile(output_path, 'w') as zipf:
            # Add the article data file
            data_filename = f'articles.{format_type}'
            if format_type == 'json':
                json_data = json.dumps(articles_data, ensure_ascii=False, indent=2)
                zipf.writestr(data_filename, json_data)
            elif format_type == 'csv':
                # Build the CSV content in memory
                if articles_data:
                    import io
                    csv_buffer = io.StringIO()
                    fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
                    writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
                    writer.writeheader()
                    for article_data in articles_data:
                        article_data['media_files'] = ';'.join(article_data['media_files']) if article_data[
                            'media_files'] else ''
                        writer.writerow(article_data)
                    zipf.writestr(data_filename, csv_buffer.getvalue())
            # Word format support
            elif format_type == 'docx':
                # Build the Word document and store it inside the ZIP
                try:
                    from docx import Document
                    from docx.shared import Inches
                    from io import BytesIO

                    doc = Document()
                    doc.add_heading('文章导出', 0)

                    for article_data in articles_data:
                        doc.add_heading(article_data['title'], level=1)
                        doc.add_paragraph(f"网站: {article_data['website']}")
                        doc.add_paragraph(f"URL: {article_data['url']}")
                        doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
                        doc.add_paragraph(f"创建时间: {article_data['created_at']}")

                        doc.add_heading('内容', level=2)
                        from bs4 import BeautifulSoup
                        soup = BeautifulSoup(article_data['content'], 'html.parser')

                        # Embed images found in the content
                        for img in soup.find_all('img'):
                            src = img.get('src', '')
                            if src:
                                # Try to add the image to the document
                                try:
                                    import os
                                    from django.conf import settings
                                    import requests

                                    # Build the full image path
                                    if src.startswith('http'):
                                        # Remote image
                                        response = requests.get(src, timeout=10)
                                        image_stream = BytesIO(response.content)
                                        doc.add_picture(image_stream, width=Inches(4.0))
                                    else:
                                        # Local image
                                        full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                                        if os.path.exists(full_path):
                                            doc.add_picture(full_path, width=Inches(4.0))
                                except Exception as e:
                                    # If embedding fails, keep the image URL as text
                                    doc.add_paragraph(f"[图片: {src}]")
                            # Drop the original img tag
                            img.decompose()

                        content_text = soup.get_text()
                        doc.add_paragraph(content_text)

                        if article_data['media_files']:
                            doc.add_heading('媒体文件', level=2)
                            for media_file in article_data['media_files']:
                                try:
                                    import os
                                    from django.conf import settings

                                    full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                                    if os.path.exists(full_path):
                                        # Embed the picture
                                        doc.add_picture(full_path, width=Inches(4.0))
                                    else:
                                        # Media file given as a URL
                                        if media_file.startswith('http'):
                                            response = requests.get(media_file, timeout=10)
                                            image_stream = BytesIO(response.content)
                                            doc.add_picture(image_stream, width=Inches(4.0))
                                        else:
                                            doc.add_paragraph(media_file)
                                except Exception as e:
                                    doc.add_paragraph(media_file)

                        doc.add_page_break()

                    # Save the document to memory, then write it into the ZIP
                    doc_buffer = BytesIO()
                    doc.save(doc_buffer)
                    doc_buffer.seek(0)
                    zipf.writestr(data_filename, doc_buffer.read())
                except ImportError:
                    zipf.writestr(data_filename, "错误缺少python-docx库无法生成Word文档")

            # Add the media files
            for media_path in media_files:
                arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
                zipf.write(media_path, arcname)
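
The command's filename (and therefore its name under `python manage.py`) is not shown in this diff, so the name below is an assumption. A minimal sketch of calling it from code with the options it defines:

# Hypothetical command name "export_articles"; only the option names come from the diff.
from django.core.management import call_command

# JSON export of a single site's articles
call_command("export_articles", format="json", website="东方烟草报")

# ZIP export that bundles the article data together with the downloaded media files
call_command("export_articles", format="csv", include_media=True, output="backup.zip")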

View File

@@ -1,5 +1,6 @@
from django.db import models

class Website(models.Model):
    name = models.CharField(max_length=100, unique=True)
    base_url = models.URLField()

View File

@@ -1,17 +1,85 @@
<!DOCTYPE html>
<html lang="zh">
<head>
    <meta charset="UTF-8"/>
    <title>{{ article.title }}</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 1200px; /* Changed: keep the page max width in sync with the list page */
margin: 0 auto;
padding: 20px;
background-color: #f8f9fa;
}
.article-container {
background: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
padding: 30px;
margin-bottom: 20px;
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
margin-top: 0;
}
.meta {
color: #7f8c8d;
font-size: 0.9em;
margin-bottom: 20px;
}
hr {
border: 0;
height: 1px;
background: #ecf0f1;
margin: 20px 0;
}
.content {
font-size: 16px;
}
.content img {
max-width: 100%;
height: auto;
border-radius: 4px;
margin: 10px 0;
}
.back-link {
display: inline-block;
padding: 10px 20px;
background-color: #3498db;
color: white;
text-decoration: none;
border-radius: 4px;
transition: background-color 0.3s;
}
.back-link:hover {
background-color: #2980b9;
}
</style>
</head>
<body>
<div class="article-container">
    <h1>{{ article.title }}</h1>
    <div class="meta">
        <p>发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}</p>
    </div>
    <hr/>
    <div class="content">
        {{ article.content|safe }}
    </div>
    <hr/>
    <p><a href="{% url 'article_list' %}" class="back-link">返回列表</a></p>
</div>
</body>
</html>

View File

@@ -1,33 +1,252 @@
<!DOCTYPE html>
<html lang="zh">
<head>
    <meta charset="UTF-8"/>
    <title>绿色课堂文章列表</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
line-height: 1.6;
color: #333;
max-width: 1200px; /* Changed: increase the page max width */
margin: 0 auto;
padding: 20px;
background-color: #f8f9fa;
}
.container {
background: white;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
padding: 30px;
margin-bottom: 20px;
}
h1 {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 10px;
margin-top: 0;
}
.filters {
margin-bottom: 20px;
padding: 15px;
background-color: #f1f8ff;
border-radius: 5px;
}
.filters a {
display: inline-block;
padding: 5px 10px;
margin: 0 5px 5px 0;
background-color: #e1e8ed;
color: #333;
text-decoration: none;
border-radius: 3px;
}
.filters a.active {
background-color: #3498db;
color: white;
}
ul {
list-style: none;
padding: 0;
}
li {
padding: 10px 0;
border-bottom: 1px solid #ecf0f1;
}
li:last-child {
border-bottom: none;
}
a {
color: #3498db;
text-decoration: none;
}
a:hover {
color: #2980b9;
text-decoration: underline;
}
.meta {
color: #7f8c8d;
font-size: 0.9em;
}
.pagination {
margin-top: 30px;
text-align: center;
padding: 20px 0;
}
.pagination a {
display: inline-block;
padding: 8px 16px;
background-color: #3498db;
color: white;
text-decoration: none;
border-radius: 4px;
margin: 0 2px; /* Changed: adjust the spacing between page numbers */
}
.pagination a:hover {
background-color: #2980b9;
}
.pagination span {
margin: 0 10px;
color: #7f8c8d;
}
/* New: current page number style */
.pagination .current {
background-color: #2980b9;
cursor: default;
}
/* New: ellipsis style */
.pagination .ellipsis {
display: inline-block;
padding: 8px 4px;
color: #7f8c8d;
}
/* New: search box style */
.search-form {
margin-bottom: 20px;
padding: 15px;
background-color: #f1f8ff;
border-radius: 5px;
}
.search-form input[type="text"] {
padding: 8px 12px;
border: 1px solid #ddd;
border-radius: 4px;
width: 300px;
margin-right: 10px;
}
.search-form input[type="submit"] {
padding: 8px 16px;
background-color: #3498db;
color: white;
border: none;
border-radius: 4px;
cursor: pointer;
}
.search-form input[type="submit"]:hover {
background-color: #2980b9;
}
.search-info {
color: #7f8c8d;
font-size: 0.9em;
margin-bottom: 10px;
}
</style>
</head>
<body>
<div class="container">
    <h1>绿色课堂文章列表</h1>
<!-- New: link back to the home page -->
<div style="margin-bottom: 20px;">
<a href="{% url 'article_list' %}" style="color: #3498db; text-decoration: none;">&larr; 返回首页</a>
</div>
<!-- New: search form -->
<div class="search-form">
<form method="get">
<input type="text" name="q" placeholder="输入关键词搜索文章..." value="{{ search_query }}">
{% if selected_website %}
<input type="hidden" name="website" value="{{ selected_website.id }}">
{% endif %}
<input type="submit" value="搜索">
</form>
</div>
<div class="filters">
<strong>按网站筛选:</strong>
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" {% if not selected_website %}class="active" {% endif %}>全部</a>
{% for website in websites %}
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" {% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
{% endfor %}
</div>
<!-- New: search result summary -->
{% if search_query %}
<div class="search-info">
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
</div>
{% endif %}
    <ul>
        {% for article in page_obj %}
            <li>
                <a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
                <div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
            </li>
        {% empty %}
            <li>暂无文章</li>
        {% endfor %}
    </ul>

    <div class="pagination">
        {% if page_obj.has_previous %}
            {% if selected_website %}
                <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">&laquo; 首页</a>
                <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
            {% else %}
                <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">&laquo; 首页</a>
                <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
            {% endif %}
        {% endif %}

        <span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>

        <!-- Changed: improved page-number display logic -->
        {% with page_obj.paginator as paginator %}
            {% for num in paginator.page_range %}
                {% if page_obj.number == num %}
                    <a href="#" class="current">{{ num }}</a>
                {% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
                    {% if selected_website %}
                        <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
                    {% else %}
                        <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
                    {% endif %}
                {% elif num == 1 or num == paginator.num_pages %}
                    {% if selected_website %}
                        <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
                    {% else %}
                        <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
                    {% endif %}
                {% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
                    <span class="ellipsis">...</span>
                {% endif %}
            {% endfor %}
        {% endwith %}

        {% if page_obj.has_next %}
            {% if selected_website %}
                <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
                <a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
            {% else %}
                <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
                <a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页 &raquo;</a>
            {% endif %}
        {% endif %}
    </div>
</div>
</body>
</html>

View File

@@ -1,20 +1,50 @@
# core/utils.py
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque

from django.utils import timezone
from django.conf import settings
from core.models import Article
import re

def download_media(url, save_dir):
    try:
        # Send request headers to avoid 403 Forbidden responses
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Referer": urljoin(url, "/")
        }
        resp = requests.get(url, timeout=15, headers=headers)
        resp.raise_for_status()
    except Exception as e:
        print(f"下载失败:{url},错误:{e}")
        return None

    # Handle the filename more safely: strip query parameters and special characters
    parsed_url = urlparse(url)
    filename = os.path.basename(parsed_url.path)
    if not filename or '.' not in filename:
        # Fall back to a default name when the URL path has no usable filename
        filename = 'media_file'

    # Strip special characters from the filename
    filename = re.sub(r'[^\w\-_\.]', '_', filename)

    # Make sure the file has an extension
    if '.' not in filename:
        content_type = resp.headers.get('content-type', '')
        if 'image/jpeg' in content_type:
            filename += '.jpg'
        elif 'image/png' in content_type:
            filename += '.png'
        elif 'image/gif' in content_type:
            filename += '.gif'
        else:
            filename += '.bin'  # default binary extension

    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)
@@ -27,22 +57,51 @@ def download_media(url, save_dir):
    with open(filepath, "wb") as f:
        f.write(resp.content)

    # Return a relative path, which is easier to store in the DB and to display
    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")

def process_article(url, website):
    if Article.objects.filter(url=url).exists():
        print(f"文章已存在,跳过: {url}")
        return

    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")

    # Handle each site's article structure
    if website.name == "www.news.cn":
        title_tag = soup.find("span", class_="title")
        content_tag = soup.find("span", id="detailContent")
    elif website.name == "东方烟草报":
        # Title extraction for 东方烟草报: try several selectors in priority order
        title_tag = (
            soup.find("h1", id="title") or       # h1 tags carrying id="title"
            soup.find("h1") or                    # main heading tag
            soup.find("title") or                 # page <title> tag
            soup.find("div", class_="title") or   # some pages use div.title
            soup.find("h2")                       # fallback heading tag
        )
        content_tag = soup.find("div", class_="content")  # article body is usually in div.content
        # Support an alternative content structure
        if not content_tag:
            content_tag = soup.find("div", id="gallery")
        # Support yet another content structure
        if not content_tag:
            content_tag = soup.find("div", id="ContentText")
    else:
        # Default handling
        title_tag = soup.find("h1") or soup.find("title")
        content_tag = soup.find("div", class_="content") or soup.find("div", id="content")

    title = title_tag.get_text(strip=True) if title_tag else "无标题"
    # Extra cleanup on the title to remove stray whitespace
    title = title.strip() if title else "无标题"

    if not content_tag:
        print("没有找到正文,跳过:", url)
        return

    imgs = content_tag.find_all("img")
@@ -56,22 +115,16 @@ def crawl_xinhua_article(url, website):
        src = img.get("src")
        if not src:
            continue

        # Use the article URL as the base when joining relative paths, to avoid broken links
        if not src.startswith("http"):
            src = urljoin(url, src)

        local_path = download_media(src, save_dir)
        if local_path:
            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
            media_files.append(rel_path.replace("\\", "/"))

    content_html = str(content_tag)

    article = Article.objects.create(
        website=website,
        title=title,
@@ -82,22 +135,74 @@ def crawl_xinhua_article(url, website):
    )
    print(f"已保存文章及图片:{title}")

def is_valid_url(url, base_netloc):
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return False
        if parsed.netloc != base_netloc:
            return False
        return True
    except Exception:
        return False

def full_site_crawler(start_url, website, max_pages=1000):
    headers = {"User-Agent": "Mozilla/5.0"}
    visited = set()
    queue = deque([start_url])

    base_netloc = urlparse(start_url).netloc

    pages_crawled = 0

    while queue and pages_crawled < max_pages:
        url = queue.popleft()
        if url in visited:
            continue

        print(f"正在爬取:{url}")
        visited.add(url)

        try:
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()
        except Exception as e:
            print(f"请求失败:{url},错误:{e}")
            continue

        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")

        # Decide, per site, whether this page is an article page
        is_article_page = False
        if website.name == "www.news.cn":
            is_article_page = soup.find("span", id="detailContent") is not None
        elif website.name == "东方烟草报":
            # For 东方烟草报, also use URL patterns:
            # article URLs usually contain /content/ plus a date-like segment
            parsed_url = urlparse(url)
            path = parsed_url.path
            is_article_page = (
                soup.find("div", class_="content") is not None or
                soup.find("div", id="gallery") is not None or
                soup.find("div", id="ContentText") is not None or
                ("/content/" in path and len(path) > 20)
            )
        else:
            # Default detection logic
            is_article_page = (
                soup.find("div", class_="content") is not None or
                soup.find("div", id="content") is not None
            )

        # If it is an article page, hand it off to the article processor
        if is_article_page:
            process_article(url, website)
            pages_crawled += 1

        # Grow the queue with newly discovered links
        for link in soup.find_all("a", href=True):
            href = urljoin(url, link["href"])
            if href not in visited and is_valid_url(href, base_netloc):
                queue.append(href)

View File

@@ -1,28 +1,44 @@
from django.shortcuts import render
from django.core.paginator import Paginator
from .models import Article, Website

def article_list(request):
    # All enabled websites, for the filter bar
    websites = Website.objects.filter(enabled=True)

    # Website filter
    selected_website = None
    articles = Article.objects.all()
    website_id = request.GET.get('website')
    if website_id:
        try:
            selected_website = Website.objects.get(id=website_id)
            articles = articles.filter(website=selected_website)
        except Website.DoesNotExist:
            pass

    # New: keyword search
    search_query = request.GET.get('q')
    if search_query:
        articles = articles.filter(title__icontains=search_query)

    # Newest first
    articles = articles.order_by('-created_at')

    # Pagination
    paginator = Paginator(articles, 10)  # 10 articles per page
    page_number = request.GET.get('page')
    page_obj = paginator.get_page(page_number)

    return render(request, 'core/article_list.html', {
        'page_obj': page_obj,
        'websites': websites,
        'selected_website': selected_website,
        # New: pass the search keyword to the template
        'search_query': search_query
    })

def article_detail(request, article_id):
    article = Article.objects.get(id=article_id)
    return render(request, 'core/article_detail.html', {'article': article})

View File

@@ -1,13 +1,18 @@
from django.conf import settings
from django.conf.urls.static import static
from django.contrib import admin
from django.urls import path, include
# Import the custom admin site instances
from core.admin import news_cn_admin, dongfangyancao_admin

urlpatterns = [
    path('admin/', admin.site.urls),
    path('news_cn_admin/', news_cn_admin.urls),
    path('dongfangyancao_admin/', dongfangyancao_admin.urls),
    # Front-end routes go through the core app's urls
    path('', include('core.urls')),
]

if settings.DEBUG:
    urlpatterns += static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT)

requirements.txt (new file, 31 lines)
View File

@@ -0,0 +1,31 @@
asgiref==3.9.1
asttokens==3.0.0
beautifulsoup4==4.13.4
bs4==0.0.2
certifi==2025.8.3
charset-normalizer==3.4.3
decorator==5.2.1
Django==5.1
executing==2.2.0
idna==3.10
ipython==9.4.0
ipython_pygments_lexers==1.1.1
jedi==0.19.2
lxml==6.0.0
matplotlib-inline==0.1.7
parso==0.8.4
pexpect==4.9.0
prompt_toolkit==3.0.51
ptyprocess==0.7.0
pure_eval==0.2.3
Pygments==2.19.2
python-docx==1.2.0
requests==2.32.4
soupsieve==2.7
sqlparse==0.5.3
stack-data==0.6.3
traitlets==5.14.3
typing_extensions==4.14.1
urllib3==2.5.0
uv==0.8.8
wcwidth==0.2.13