Compare commits
5 Commits
969d46b070
...
958b087f54
| Author | SHA1 | Date | |
|---|---|---|---|
| 958b087f54 | |||
| b6bbb90703 | |||
| bfd1604872 | |||
| d9d2ea9d99 | |||
| 6d80326a4e |
338
core/admin.py
338
core/admin.py
@@ -1,11 +1,349 @@
|
||||
from django.contrib import admin
|
||||
from django.contrib.admin import AdminSite
|
||||
from .models import Website, Article
|
||||
# 添加actions相关的导入
|
||||
from django.contrib import messages
|
||||
from django.http import HttpResponseRedirect
|
||||
# 添加导出功能所需导入
|
||||
import csv
|
||||
from django.http import HttpResponse
|
||||
import json
|
||||
|
||||
|
||||
# 创建自定义管理站点
|
||||
class NewsCnAdminSite(AdminSite):
    """Dedicated admin site for 新华网 (news.cn) content management."""

    index_title = "新华网内容管理"
    site_title = "新华网管理"
    site_header = "新华网管理后台"
|
||||
|
||||
|
||||
class DongfangyancaoAdminSite(AdminSite):
    """Dedicated admin site for 东方烟草报 content management."""

    index_title = "东方烟草报内容管理"
    site_title = "东方烟草报管理"
    site_header = "东方烟草报管理后台"
|
||||
|
||||
|
||||
# Instantiate the per-site admin sites; each `name` becomes its own URL namespace.
news_cn_admin = NewsCnAdminSite(name='news_cn_admin')
dongfangyancao_admin = DongfangyancaoAdminSite(name='dongfangyancao_admin')
|
||||
|
||||
|
||||
@admin.register(Website)
class WebsiteAdmin(admin.ModelAdmin):
    """Admin configuration for crawled Website records."""

    list_display = ('name', 'base_url', 'enabled')
|
||||
|
||||
|
||||
# Default Article admin with bulk delete and export actions.
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
    """Admin for Article: list/search config plus bulk delete and CSV/JSON/Word export actions."""

    list_display = ('title', 'website', 'pub_date')
    search_fields = ('title', 'content')
    # FIX: the original listed 'delete_selected_articles', which is defined
    # nowhere — Django's built-in bulk delete action is named 'delete_selected'.
    actions = ['delete_selected', 'delete_dongfangyancao_articles', 'export_as_csv',
               'export_as_json', 'export_as_word']

    def delete_dongfangyancao_articles(self, request, queryset):
        """Delete every 东方烟草报 article in one click.

        NOTE: ignores the admin selection (`queryset`) on purpose and deletes
        ALL articles belonging to the 东方烟草报 website.
        """
        try:
            dongfangyancao_website = Website.objects.get(name='东方烟草报')
            # .delete() returns (total_count, per-model dict); keep the total.
            deleted_count = Article.objects.filter(website=dongfangyancao_website).delete()[0]
            self.message_user(request, f"成功删除 {deleted_count} 篇东方烟草报文章", messages.SUCCESS)
        except Website.DoesNotExist:
            self.message_user(request, "未找到东方烟草报网站配置", messages.ERROR)

    # Display name shown in the admin actions dropdown.
    delete_dongfangyancao_articles.short_description = "删除所有东方烟草报文章"

    def export_as_csv(self, request, queryset):
        """Export the selected articles as a CSV attachment (all model fields)."""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields]

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename={}.csv'.format(meta)
        writer = csv.writer(response)

        writer.writerow(field_names)
        for obj in queryset:
            # Call zero-arg callables (e.g. methods) so their value is exported.
            row = [getattr(obj, field)() if callable(getattr(obj, field)) else getattr(obj, field)
                   for field in field_names]
            writer.writerow(row)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as a pretty-printed JSON attachment."""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=articles.json'

        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })

        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"

    def export_as_word(self, request, queryset):
        """Export the selected articles as a Word (.docx) attachment.

        Embeds images referenced in the article HTML and listed media files;
        failed image fetches degrade to a plain-text placeholder.
        """
        try:
            from docx import Document
            from docx.shared import Inches
        except ImportError:
            self.message_user(request, "缺少python-docx库,请安装: pip install python-docx", messages.ERROR)
            return

        # FIX: hoisted — the original re-imported these inside every loop iteration.
        import os
        import requests
        from io import BytesIO
        from django.conf import settings
        from bs4 import BeautifulSoup

        doc = Document()
        doc.add_heading('文章导出', 0)

        for article in queryset:
            doc.add_heading(article.title, level=1)

            # Article metadata block.
            doc.add_paragraph(f"网站: {article.website.name}")
            doc.add_paragraph(f"URL: {article.url}")
            doc.add_paragraph(
                f"发布时间: {article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else 'N/A'}")
            doc.add_paragraph(f"创建时间: {article.created_at.strftime('%Y-%m-%d %H:%M:%S')}")

            doc.add_heading('内容', level=2)
            # Strip HTML tags; images are embedded separately below.
            soup = BeautifulSoup(article.content, 'html.parser')

            for img in soup.find_all('img'):
                src = img.get('src', '')
                if src:
                    try:
                        if src.startswith('http'):
                            # Remote image: download into memory and embed.
                            response = requests.get(src, timeout=10)
                            doc.add_picture(BytesIO(response.content), width=Inches(4.0))
                        else:
                            # Local image under MEDIA_ROOT.
                            full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                            if os.path.exists(full_path):
                                doc.add_picture(full_path, width=Inches(4.0))
                    except Exception:
                        # Embedding failed — fall back to a textual placeholder.
                        doc.add_paragraph(f"[图片: {src}]")
                # Remove the tag so it doesn't leak into get_text().
                img.decompose()

            doc.add_paragraph(soup.get_text())

            # Attached media files (paths or URLs stored on the article).
            if article.media_files:
                doc.add_heading('媒体文件', level=2)
                for media_file in article.media_files:
                    try:
                        full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                        if os.path.exists(full_path):
                            doc.add_picture(full_path, width=Inches(4.0))
                        elif media_file.startswith('http'):
                            response = requests.get(media_file, timeout=10)
                            doc.add_picture(BytesIO(response.content), width=Inches(4.0))
                        else:
                            doc.add_paragraph(media_file)
                    except Exception:
                        doc.add_paragraph(media_file)

            doc.add_page_break()

        # Serialize the document into memory and return it as a download.
        buffer = BytesIO()
        doc.save(buffer)
        buffer.seek(0)

        response = HttpResponse(
            buffer.getvalue(),
            content_type='application/vnd.openxmlformats-officedocument.wordprocessingml.document')
        response['Content-Disposition'] = 'attachment; filename=articles.docx'
        return response

    export_as_word.short_description = "导出选中文章为Word格式"
|
||||
|
||||
|
||||
# Site-specific article admin: only shows articles crawled from www.news.cn.
class NewsCnArticleAdmin(admin.ModelAdmin):
    """Article admin restricted to the 新华网 (www.news.cn) website."""

    list_display = ('title', 'pub_date')
    search_fields = ('title', 'content')
    list_filter = ('pub_date',)
    actions = ['export_as_csv', 'export_as_json']

    def get_queryset(self, request):
        """Limit the change list to articles whose website is www.news.cn."""
        base = super().get_queryset(request)
        return base.filter(website__name='www.news.cn')

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV (content column omitted for size)."""
        opts = self.model._meta
        columns = [f.name for f in opts.fields if f.name != 'content']

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.csv'
        writer = csv.writer(response)
        writer.writerow(columns)

        for article in queryset:
            cells = []
            for name in columns:
                cell = getattr(article, name)
                if callable(cell):
                    cell = cell()
                if name == 'website':
                    # Serialize the related Website by its display name.
                    cell = cell.name
                cells.append(cell)
            writer.writerow(cells)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as pretty-printed JSON."""
        payload = [
            {
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            }
            for article in queryset
        ]

        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=news_cn_articles.json'
        response.write(json.dumps(payload, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"
|
||||
|
||||
|
||||
class DongfangyancaoArticleAdmin(admin.ModelAdmin):
    """Article admin restricted to the 东方烟草报 website, with bulk delete and export actions."""

    list_display = ('title', 'pub_date')
    search_fields = ('title', 'content')
    list_filter = ('pub_date',)
    # FIX: the original listed 'delete_selected_articles', which is defined
    # nowhere — Django's built-in bulk delete action is named 'delete_selected'.
    actions = ['delete_selected', 'delete_all_articles', 'export_as_csv', 'export_as_json']

    def get_queryset(self, request):
        """Limit the change list to articles whose website is 东方烟草报."""
        qs = super().get_queryset(request)
        return qs.filter(website__name='东方烟草报')

    def delete_all_articles(self, request, queryset):
        """Delete every article in the current filtered queryset (all 东方烟草报 articles).

        NOTE: ignores the admin selection on purpose and deletes the whole
        filtered set returned by get_queryset().
        """
        deleted_count = self.get_queryset(request).delete()[0]
        self.message_user(request, f"成功删除 {deleted_count} 篇文章", messages.SUCCESS)

    # Display name shown in the admin actions dropdown.
    delete_all_articles.short_description = "删除所有当前筛选的文章"

    def export_as_csv(self, request, queryset):
        """Export the selected articles as CSV (content column omitted for size)."""
        meta = self.model._meta
        field_names = [field.name for field in meta.fields if field.name != 'content']

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.csv'
        writer = csv.writer(response)

        writer.writerow(field_names)
        for obj in queryset:
            row = []
            for field in field_names:
                value = getattr(obj, field)
                if callable(value):
                    value = value()
                if field == 'website':
                    # Serialize the related Website by its display name.
                    value = value.name
                row.append(value)
            writer.writerow(row)

        return response

    export_as_csv.short_description = "导出选中文章为CSV格式"

    def export_as_json(self, request, queryset):
        """Export the selected articles as pretty-printed JSON."""
        response = HttpResponse(content_type='application/json')
        response['Content-Disposition'] = 'attachment; filename=dongfangyancao_articles.json'

        articles_data = []
        for article in queryset:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.strftime('%Y-%m-%d %H:%M:%S') if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                'media_files': article.media_files
            })

        response.write(json.dumps(articles_data, ensure_ascii=False, indent=2))
        return response

    export_as_json.short_description = "导出选中文章为JSON格式"
|
||||
|
||||
|
||||
# Register the models on each site-specific admin instance.
news_cn_admin.register(Website, WebsiteAdmin)
news_cn_admin.register(Article, NewsCnArticleAdmin)

dongfangyancao_admin.register(Website, WebsiteAdmin)
dongfangyancao_admin.register(Article, DongfangyancaoArticleAdmin)
|
||||
|
||||
20
core/management/commands/crawl_dongfangyancao.py
Normal file
20
core/management/commands/crawl_dongfangyancao.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Full-site recursive crawl of 东方烟草报 (eastobacco.com)."""

    help = "全站递归爬取 东方烟草报"

    def handle(self, *args, **kwargs):
        """Ensure the Website row exists, then crawl the whole site (up to 500 pages)."""
        # FIX: the `created` flag was captured but never used.
        website, _ = Website.objects.get_or_create(
            name="东方烟草报",
            defaults={
                'article_list_url': 'https://www.eastobacco.com/',
                'article_selector': 'a'
            }
        )
        start_url = "https://www.eastobacco.com/"
        self.stdout.write(f"开始全站爬取: {start_url}")
        full_site_crawler(start_url, website, max_pages=500)
        # Consistency: the sibling crawl commands style their completion message.
        self.stdout.write(self.style.SUCCESS("爬取完成"))
|
||||
@@ -1,18 +1,20 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import crawl_xinhua_list
|
||||
from core.utils import full_site_crawler
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Full-site recursive crawl of www.news.cn.

    FIX: the rendered text contained two `help` assignments and two `handle`
    definitions (old list-page crawler + new full-site crawler merged by the
    diff); the first of each was dead code shadowed by the later definition.
    Only the surviving full-site version is kept.
    """

    help = "全站递归爬取 www.news.cn"

    def handle(self, *args, **kwargs):
        """Ensure the Website row exists, then crawl the whole site (up to 500 pages)."""
        website, _ = Website.objects.get_or_create(
            name="www.news.cn",
            defaults={
                'article_list_url': 'https://www.news.cn/',
                'article_selector': 'a'
            }
        )
        start_url = "https://www.news.cn/"
        self.stdout.write(f"开始全站爬取: {start_url}")
        full_site_crawler(start_url, website, max_pages=500)
        self.stdout.write("爬取完成")
|
||||
|
||||
21
core/management/commands/crawl_xinhua_bak.py
Normal file
21
core/management/commands/crawl_xinhua_bak.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Website
|
||||
from core.utils import crawl_xinhua_list
|
||||
|
||||
class Command(BaseCommand):
    """Legacy list-page crawler for 新华网, kept as a backup command."""

    help = '批量爬取新华网文章'

    def handle(self, *args, **options):
        # Usage marker so logs reveal whether this backup command still runs.
        self.stdout.write(self.style.WARNING("crawl_xinhua command is being used"))

        index_url = "https://www.news.cn/legal/index.html"
        try:
            site = Website.objects.get(base_url="https://www.news.cn/")
        except Website.DoesNotExist:
            self.stdout.write(self.style.ERROR("网站 https://www.news.cn/ 不存在,请先后台添加"))
            return

        self.stdout.write(f"开始爬取文章列表页: {index_url}")
        crawl_xinhua_list(index_url, site)
        self.stdout.write(self.style.SUCCESS("批量爬取完成"))
|
||||
311
core/management/commands/export_articles.py
Normal file
311
core/management/commands/export_articles.py
Normal file
@@ -0,0 +1,311 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
from core.models import Article, Website
|
||||
import json
|
||||
import csv
|
||||
import os
|
||||
from django.conf import settings
|
||||
from django.core.files.storage import default_storage
|
||||
import zipfile
|
||||
from django.utils import timezone
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Export articles (optionally with their media files) to JSON/CSV/Word/ZIP."""

    help = '导出文章及相关的媒体文件(图片、视频等)'

    def add_arguments(self, parser):
        parser.add_argument('--format', type=str, default='json', help='导出格式: json 或 csv')
        parser.add_argument('--website', type=str, help='指定网站名称导出特定网站的文章')
        parser.add_argument('--output', type=str, default='', help='输出文件路径')
        parser.add_argument('--include-media', action='store_true', help='包含媒体文件')

    def handle(self, *args, **options):
        """Collect the requested articles and dispatch to the chosen exporter."""
        format_type = options['format'].lower()
        website_name = options['website']
        output_path = options['output']
        include_media = options['include_media']

        articles = Article.objects.all()
        if website_name:
            try:
                website = Website.objects.get(name=website_name)
            except Website.DoesNotExist:
                self.stdout.write(self.style.ERROR(f'网站 "{website_name}" 不存在'))
                return
            articles = articles.filter(website=website)

        if not articles.exists():
            self.stdout.write(self.style.WARNING('没有找到文章'))
            return

        # Build plain-dict export records (and, optionally, the list of
        # local media files to bundle into the ZIP).
        articles_data = []
        media_files = []
        for article in articles:
            articles_data.append({
                'id': article.id,
                'title': article.title,
                'website': article.website.name,
                'url': article.url,
                'pub_date': article.pub_date.isoformat() if article.pub_date else None,
                'content': article.content,
                'created_at': article.created_at.isoformat(),
                'media_files': article.media_files
            })
            if include_media:
                for media_path in article.media_files:
                    full_path = os.path.join(settings.MEDIA_ROOT, media_path)
                    if os.path.exists(full_path):
                        media_files.append(full_path)

        # Default output name: timestamped; ZIP when media is bundled.
        if not output_path:
            timestamp = timezone.now().strftime('%Y%m%d_%H%M%S')
            if include_media:
                output_path = f'articles_export_{timestamp}.zip'
            else:
                output_path = f'articles_export_{timestamp}.{format_type}'

        if include_media:
            self.export_with_media(articles_data, media_files, output_path, format_type)
        else:
            if format_type == 'json':
                self.export_as_json(articles_data, output_path)
            elif format_type == 'csv':
                self.export_as_csv(articles_data, output_path)
            elif format_type == 'docx':
                self.export_as_word(articles_data, output_path)
            else:
                self.stdout.write(self.style.ERROR('不支持的格式,仅支持 json、csv 或 docx'))
                return

        self.stdout.write(self.style.SUCCESS(f'成功导出 {len(articles_data)} 篇文章到 {output_path}'))

    def export_as_json(self, articles_data, output_path):
        """Write the article dicts to *output_path* as pretty-printed JSON."""
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(articles_data, f, ensure_ascii=False, indent=2)

    def export_as_csv(self, articles_data, output_path):
        """Write the article dicts to *output_path* as CSV.

        NOTE: rewrites each record's 'media_files' list into a ';'-joined
        string so it fits in a single CSV cell (mutates the input dicts).
        """
        if not articles_data:
            return

        with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for article_data in articles_data:
                article_data['media_files'] = ';'.join(article_data['media_files']) \
                    if article_data['media_files'] else ''
                writer.writerow(article_data)

    def _build_word_document(self, articles_data):
        """Build and return a python-docx Document for *articles_data*.

        Shared by export_as_word and export_with_media (the original
        duplicated this ~80-line builder in both places). Raises ImportError
        when python-docx is missing — callers handle it.

        FIX: imports are hoisted here once. The original re-imported
        os/settings/requests/BytesIO on every loop iteration, and the
        media-file loop inside export_with_media used `requests` without
        importing it at all — the resulting NameError was swallowed by the
        broad `except`, so remote media files were silently written as plain
        text instead of being embedded.
        """
        from docx import Document
        from docx.shared import Inches
        import requests
        from io import BytesIO
        from bs4 import BeautifulSoup

        doc = Document()
        doc.add_heading('文章导出', 0)

        for article_data in articles_data:
            doc.add_heading(article_data['title'], level=1)

            # Article metadata block.
            doc.add_paragraph(f"网站: {article_data['website']}")
            doc.add_paragraph(f"URL: {article_data['url']}")
            doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
            doc.add_paragraph(f"创建时间: {article_data['created_at']}")

            doc.add_heading('内容', level=2)
            # Strip HTML tags; images are embedded separately below.
            soup = BeautifulSoup(article_data['content'], 'html.parser')

            for img in soup.find_all('img'):
                src = img.get('src', '')
                if src:
                    try:
                        if src.startswith('http'):
                            # Remote image: download into memory and embed.
                            response = requests.get(src, timeout=10)
                            doc.add_picture(BytesIO(response.content), width=Inches(4.0))
                        else:
                            # Local image under MEDIA_ROOT.
                            full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                            if os.path.exists(full_path):
                                doc.add_picture(full_path, width=Inches(4.0))
                    except Exception:
                        # Embedding failed — fall back to a textual placeholder.
                        doc.add_paragraph(f"[图片: {src}]")
                # Remove the tag so it doesn't leak into get_text().
                img.decompose()

            doc.add_paragraph(soup.get_text())

            # Attached media files (paths or URLs stored on the article).
            if article_data['media_files']:
                doc.add_heading('媒体文件', level=2)
                for media_file in article_data['media_files']:
                    try:
                        full_path = os.path.join(settings.MEDIA_ROOT, media_file)
                        if os.path.exists(full_path):
                            doc.add_picture(full_path, width=Inches(4.0))
                        elif media_file.startswith('http'):
                            response = requests.get(media_file, timeout=10)
                            doc.add_picture(BytesIO(response.content), width=Inches(4.0))
                        else:
                            doc.add_paragraph(media_file)
                    except Exception:
                        doc.add_paragraph(media_file)

            doc.add_page_break()

        return doc

    def export_as_word(self, articles_data, output_path):
        """Write the article dicts to *output_path* as a Word (.docx) document."""
        try:
            doc = self._build_word_document(articles_data)
        except ImportError:
            self.stdout.write(self.style.ERROR('缺少python-docx库,请安装: pip install python-docx'))
            return
        doc.save(output_path)

    def export_with_media(self, articles_data, media_files, output_path, format_type):
        """Write a ZIP containing the data file plus all local media files."""
        with zipfile.ZipFile(output_path, 'w') as zipf:
            data_filename = f'articles.{format_type}'
            if format_type == 'json':
                json_data = json.dumps(articles_data, ensure_ascii=False, indent=2)
                zipf.writestr(data_filename, json_data)
            elif format_type == 'csv':
                if articles_data:
                    import io
                    csv_buffer = io.StringIO()
                    fieldnames = ['id', 'title', 'website', 'url', 'pub_date', 'content', 'created_at', 'media_files']
                    writer = csv.DictWriter(csv_buffer, fieldnames=fieldnames)
                    writer.writeheader()
                    for article_data in articles_data:
                        article_data['media_files'] = ';'.join(article_data['media_files']) \
                            if article_data['media_files'] else ''
                        writer.writerow(article_data)
                    zipf.writestr(data_filename, csv_buffer.getvalue())
            elif format_type == 'docx':
                try:
                    from io import BytesIO
                    doc = self._build_word_document(articles_data)
                    # Serialize into memory, then store inside the ZIP.
                    doc_buffer = BytesIO()
                    doc.save(doc_buffer)
                    doc_buffer.seek(0)
                    zipf.writestr(data_filename, doc_buffer.read())
                except ImportError:
                    zipf.writestr(data_filename, "错误:缺少python-docx库,无法生成Word文档")

            # Bundle the local media files under media/ preserving relative paths.
            for media_path in media_files:
                arcname = os.path.join('media', os.path.relpath(media_path, settings.MEDIA_ROOT))
                zipf.write(media_path, arcname)
|
||||
@@ -1,5 +1,6 @@
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Website(models.Model):
|
||||
name = models.CharField(max_length=100, unique=True)
|
||||
base_url = models.URLField()
|
||||
|
||||
@@ -1,17 +1,85 @@
|
||||
<!DOCTYPE html>
<html lang="zh">
<head>
    <!-- FIX: diff residue removed — the rendered text contained both the old
         and new versions of the meta tag, the <hr> separators, and the
         back-to-list link; only the post-diff versions are kept. -->
    <meta charset="UTF-8"/>
    <title>{{ article.title }}</title>
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 1200px; /* keep in sync with the list page width */
            margin: 0 auto;
            padding: 20px;
            background-color: #f8f9fa;
        }

        .article-container {
            background: white;
            border-radius: 8px;
            box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
            padding: 30px;
            margin-bottom: 20px;
        }

        h1 {
            color: #2c3e50;
            border-bottom: 2px solid #3498db;
            padding-bottom: 10px;
            margin-top: 0;
        }

        .meta {
            color: #7f8c8d;
            font-size: 0.9em;
            margin-bottom: 20px;
        }

        hr {
            border: 0;
            height: 1px;
            background: #ecf0f1;
            margin: 20px 0;
        }

        .content {
            font-size: 16px;
        }

        .content img {
            max-width: 100%;
            height: auto;
            border-radius: 4px;
            margin: 10px 0;
        }

        .back-link {
            display: inline-block;
            padding: 10px 20px;
            background-color: #3498db;
            color: white;
            text-decoration: none;
            border-radius: 4px;
            transition: background-color 0.3s;
        }

        .back-link:hover {
            background-color: #2980b9;
        }
    </style>
</head>
<body>
<div class="article-container">
    <h1>{{ article.title }}</h1>
    <div class="meta">
        <p>发布时间: {{ article.pub_date|date:"Y-m-d H:i" }}</p>
    </div>
    <hr/>
    <div class="content">
        {{ article.content|safe }}
    </div>
    <hr/>
    <p><a href="{% url 'article_list' %}" class="back-link">← 返回列表</a></p>
</div>
</body>
</html>
|
||||
@@ -1,17 +1,200 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="zh">
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<meta charset="UTF-8"/>
|
||||
<title>绿色课堂文章列表</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
color: #333;
|
||||
max-width: 1200px; /* 修改:增加页面最大宽度 */
|
||||
margin: 0 auto;
|
||||
padding: 20px;
|
||||
background-color: #f8f9fa;
|
||||
}
|
||||
|
||||
.container {
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
|
||||
padding: 30px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #2c3e50;
|
||||
border-bottom: 2px solid #3498db;
|
||||
padding-bottom: 10px;
|
||||
margin-top: 0;
|
||||
}
|
||||
|
||||
.filters {
|
||||
margin-bottom: 20px;
|
||||
padding: 15px;
|
||||
background-color: #f1f8ff;
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
.filters a {
|
||||
display: inline-block;
|
||||
padding: 5px 10px;
|
||||
margin: 0 5px 5px 0;
|
||||
background-color: #e1e8ed;
|
||||
color: #333;
|
||||
text-decoration: none;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.filters a.active {
|
||||
background-color: #3498db;
|
||||
color: white;
|
||||
}
|
||||
|
||||
ul {
|
||||
list-style: none;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
li {
|
||||
padding: 10px 0;
|
||||
border-bottom: 1px solid #ecf0f1;
|
||||
}
|
||||
|
||||
li:last-child {
|
||||
border-bottom: none;
|
||||
}
|
||||
|
||||
a {
|
||||
color: #3498db;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
color: #2980b9;
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.meta {
|
||||
color: #7f8c8d;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.pagination {
|
||||
margin-top: 30px;
|
||||
text-align: center;
|
||||
padding: 20px 0;
|
||||
}
|
||||
|
||||
.pagination a {
|
||||
display: inline-block;
|
||||
padding: 8px 16px;
|
||||
background-color: #3498db;
|
||||
color: white;
|
||||
text-decoration: none;
|
||||
border-radius: 4px;
|
||||
margin: 0 2px; /* 修改:调整页码间距 */
|
||||
}
|
||||
|
||||
.pagination a:hover {
|
||||
background-color: #2980b9;
|
||||
}
|
||||
|
||||
.pagination span {
|
||||
margin: 0 10px;
|
||||
color: #7f8c8d;
|
||||
}
|
||||
|
||||
/* 新增:当前页码样式 */
|
||||
.pagination .current {
|
||||
background-color: #2980b9;
|
||||
cursor: default;
|
||||
}
|
||||
|
||||
/* 新增:省略号样式 */
|
||||
.pagination .ellipsis {
|
||||
display: inline-block;
|
||||
padding: 8px 4px;
|
||||
color: #7f8c8d;
|
||||
}
|
||||
|
||||
/* 新增:搜索框样式 */
|
||||
.search-form {
|
||||
margin-bottom: 20px;
|
||||
padding: 15px;
|
||||
background-color: #f1f8ff;
|
||||
border-radius: 5px;
|
||||
}
|
||||
|
||||
.search-form input[type="text"] {
|
||||
padding: 8px 12px;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 4px;
|
||||
width: 300px;
|
||||
margin-right: 10px;
|
||||
}
|
||||
|
||||
.search-form input[type="submit"] {
|
||||
padding: 8px 16px;
|
||||
background-color: #3498db;
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.search-form input[type="submit"]:hover {
|
||||
background-color: #2980b9;
|
||||
}
|
||||
|
||||
.search-info {
|
||||
color: #7f8c8d;
|
||||
font-size: 0.9em;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1>绿色课堂文章列表</h1>
|
||||
|
||||
<!-- 新增:返回首页链接 -->
|
||||
<div style="margin-bottom: 20px;">
|
||||
<a href="{% url 'article_list' %}" style="color: #3498db; text-decoration: none;">← 返回首页</a>
|
||||
</div>
|
||||
|
||||
<!-- 新增:搜索表单 -->
|
||||
<div class="search-form">
|
||||
<form method="get">
|
||||
<input type="text" name="q" placeholder="输入关键词搜索文章..." value="{{ search_query }}">
|
||||
{% if selected_website %}
|
||||
<input type="hidden" name="website" value="{{ selected_website.id }}">
|
||||
{% endif %}
|
||||
<input type="submit" value="搜索">
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<div class="filters">
|
||||
<strong>按网站筛选:</strong>
|
||||
<a href="{% url 'article_list' %}{% if search_query %}?q={{ search_query }}{% endif %}" {% if not selected_website %}class="active" {% endif %}>全部</a>
|
||||
{% for website in websites %}
|
||||
<a href="?website={{ website.id }}{% if search_query %}&q={{ search_query }}{% endif %}" {% if selected_website and selected_website.id == website.id %}class="active" {% endif %}>{{ website.name }}</a>
|
||||
{% endfor %}
|
||||
</div>
|
||||
|
||||
<!-- 新增:搜索结果信息 -->
|
||||
{% if search_query %}
|
||||
<div class="search-info">
|
||||
搜索 "{{ search_query }}" 找到 {{ page_obj.paginator.count }} 篇文章
|
||||
<a href="{% if selected_website %}?website={{ selected_website.id }}{% else %}{% url 'article_list' %}{% endif %}">清除搜索</a>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<ul>
|
||||
{% for article in page_obj %}
|
||||
<li>
|
||||
<a href="{% url 'article_detail' article.id %}">{{ article.title }}</a>
|
||||
({{ article.created_at|date:"Y-m-d" }})
|
||||
<div class="meta">({{ article.website.name }} - {{ article.created_at|date:"Y-m-d" }})</div>
|
||||
</li>
|
||||
{% empty %}
|
||||
<li>暂无文章</li>
|
||||
@@ -20,14 +203,50 @@
|
||||
|
||||
<div class="pagination">
|
||||
{% if page_obj.has_previous %}
|
||||
<a href="?page={{ page_obj.previous_page_number }}">上一页</a>
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page=1">« 首页</a>
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.previous_page_number }}">上一页</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page=1">« 首页</a>
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.previous_page_number }}">上一页</a>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
|
||||
|
||||
<!-- 修改:优化页码显示逻辑 -->
|
||||
{% with page_obj.paginator as paginator %}
|
||||
{% for num in paginator.page_range %}
|
||||
{% if page_obj.number == num %}
|
||||
<a href="#" class="current">{{ num }}</a>
|
||||
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||
{% endif %}
|
||||
{% elif num == 1 or num == paginator.num_pages %}
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ num }}">{{ num }}</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ num }}">{{ num }}</a>
|
||||
{% endif %}
|
||||
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
|
||||
<span class="ellipsis">...</span>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endwith %}
|
||||
|
||||
{% if page_obj.has_next %}
|
||||
<a href="?page={{ page_obj.next_page_number }}">下一页</a>
|
||||
{% if selected_website %}
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.next_page_number }}">下一页</a>
|
||||
<a href="?website={{ selected_website.id }}{% if search_query %}&q={{ search_query }}{% endif %}&page={{ page_obj.paginator.num_pages }}">末页 »</a>
|
||||
{% else %}
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.next_page_number }}">下一页</a>
|
||||
<a href="?{% if search_query %}q={{ search_query }}&{% endif %}page={{ page_obj.paginator.num_pages }}">末页 »</a>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
169
core/utils.py
169
core/utils.py
@@ -1,20 +1,50 @@
|
||||
# core/utils.py
|
||||
import os
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from collections import deque
|
||||
from django.utils import timezone
|
||||
from django.conf import settings
|
||||
from core.models import Article
|
||||
import re
|
||||
|
||||
|
||||
def download_media(url, save_dir):
|
||||
try:
|
||||
resp = requests.get(url, timeout=15)
|
||||
# 添加请求头以避免403 Forbidden错误
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
||||
"Referer": urljoin(url, "/")
|
||||
}
|
||||
resp = requests.get(url, timeout=15, headers=headers)
|
||||
resp.raise_for_status()
|
||||
except Exception as e:
|
||||
print(f"下载失败:{url},错误:{e}")
|
||||
return None
|
||||
|
||||
filename = url.split("/")[-1].split("?")[0]
|
||||
# 更安全地处理文件名,去除查询参数并处理特殊字符
|
||||
parsed_url = urlparse(url)
|
||||
filename = os.path.basename(parsed_url.path)
|
||||
if not filename or '.' not in filename:
|
||||
# 如果URL路径中没有有效的文件名,使用默认名称
|
||||
filename = 'media_file'
|
||||
|
||||
# 清理文件名中的特殊字符
|
||||
filename = re.sub(r'[^\w\-_\.]', '_', filename)
|
||||
|
||||
# 确保文件有扩展名
|
||||
if '.' not in filename:
|
||||
content_type = resp.headers.get('content-type', '')
|
||||
if 'image/jpeg' in content_type:
|
||||
filename += '.jpg'
|
||||
elif 'image/png' in content_type:
|
||||
filename += '.png'
|
||||
elif 'image/gif' in content_type:
|
||||
filename += '.gif'
|
||||
else:
|
||||
filename += '.bin' # 默认二进制扩展名
|
||||
|
||||
os.makedirs(save_dir, exist_ok=True)
|
||||
filepath = os.path.join(save_dir, filename)
|
||||
|
||||
@@ -27,22 +57,51 @@ def download_media(url, save_dir):
|
||||
|
||||
with open(filepath, "wb") as f:
|
||||
f.write(resp.content)
|
||||
return filepath
|
||||
|
||||
# 返回相对路径,方便存数据库和展示
|
||||
return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")
|
||||
|
||||
def crawl_xinhua_article(url, website):
|
||||
def process_article(url, website):
|
||||
if Article.objects.filter(url=url).exists():
|
||||
print(f"文章已存在,跳过: {url}")
|
||||
return
|
||||
|
||||
headers = {"User-Agent": "Mozilla/5.0"}
|
||||
resp = requests.get(url, headers=headers)
|
||||
resp.encoding = 'utf-8'
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
|
||||
# 处理不同网站的文章结构
|
||||
if website.name == "www.news.cn":
|
||||
title_tag = soup.find("span", class_="title")
|
||||
content_tag = soup.find("span", id="detailContent")
|
||||
elif website.name == "东方烟草报":
|
||||
# 优化东方烟草报的标题提取逻辑,按优先级尝试多种选择器
|
||||
title_tag = (
|
||||
soup.find("h1", id="title") or # 特别针对带id="title"的h1标签
|
||||
soup.find("h1") or # 主要标题标签
|
||||
soup.find("title") or # 页面title标签
|
||||
soup.find("div", class_="title") or # 某些页面可能使用div.title
|
||||
soup.find("h2") # 备选标题标签
|
||||
)
|
||||
content_tag = soup.find("div", class_="content") # 东方烟草报的内容通常在div.content中
|
||||
# 增加对另一种内容结构的支持
|
||||
if not content_tag:
|
||||
content_tag = soup.find("div", id="gallery")
|
||||
# 再增加对新内容结构的支持
|
||||
if not content_tag:
|
||||
content_tag = soup.find("div", id="ContentText")
|
||||
else:
|
||||
# 默认处理方式
|
||||
title_tag = soup.find("h1") or soup.find("title")
|
||||
content_tag = soup.find("div", class_="content") or soup.find("div", id="content")
|
||||
|
||||
title = title_tag.get_text(strip=True) if title_tag else "无标题"
|
||||
|
||||
content_tag = soup.find("span", id="detailContent")
|
||||
# 对标题进行额外处理,去除可能的多余空白字符
|
||||
title = title.strip() if title else "无标题"
|
||||
|
||||
if not content_tag:
|
||||
print(f"没有找到正文,跳过文章: {url}")
|
||||
print("没有找到正文,跳过:", url)
|
||||
return
|
||||
|
||||
imgs = content_tag.find_all("img")
|
||||
@@ -56,22 +115,16 @@ def crawl_xinhua_article(url, website):
|
||||
src = img.get("src")
|
||||
if not src:
|
||||
continue
|
||||
|
||||
# 这里用文章URL作为基准拼接相对路径,避免错误
|
||||
if not src.startswith("http"):
|
||||
src = urljoin(url, src)
|
||||
|
||||
local_rel_path = download_media(src, save_dir)
|
||||
if local_rel_path:
|
||||
img["src"] = settings.MEDIA_URL + local_rel_path
|
||||
media_files.append(local_rel_path)
|
||||
local_path = download_media(src, save_dir)
|
||||
if local_path:
|
||||
rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
|
||||
img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
|
||||
media_files.append(rel_path.replace("\\", "/"))
|
||||
|
||||
content_html = str(content_tag)
|
||||
|
||||
if Article.objects.filter(url=url).exists():
|
||||
print(f"文章已存在,跳过: {url}")
|
||||
return
|
||||
|
||||
article = Article.objects.create(
|
||||
website=website,
|
||||
title=title,
|
||||
@@ -82,22 +135,74 @@ def crawl_xinhua_article(url, website):
|
||||
)
|
||||
print(f"已保存文章及图片:{title}")
|
||||
|
||||
def crawl_xinhua_list(list_url, website):
|
||||
|
||||
def is_valid_url(url, base_netloc):
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
if parsed.scheme not in ("http", "https"):
|
||||
return False
|
||||
if parsed.netloc != base_netloc:
|
||||
return False
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def full_site_crawler(start_url, website, max_pages=1000):
|
||||
headers = {"User-Agent": "Mozilla/5.0"}
|
||||
resp = requests.get(list_url, headers=headers)
|
||||
visited = set()
|
||||
queue = deque([start_url])
|
||||
|
||||
base_netloc = urlparse(start_url).netloc
|
||||
|
||||
pages_crawled = 0
|
||||
|
||||
while queue and pages_crawled < max_pages:
|
||||
url = queue.popleft()
|
||||
if url in visited:
|
||||
continue
|
||||
print(f"正在爬取:{url}")
|
||||
visited.add(url)
|
||||
|
||||
try:
|
||||
resp = requests.get(url, headers=headers, timeout=15)
|
||||
resp.raise_for_status()
|
||||
except Exception as e:
|
||||
print(f"请求失败:{url},错误:{e}")
|
||||
continue
|
||||
|
||||
resp.encoding = 'utf-8'
|
||||
soup = BeautifulSoup(resp.text, "html.parser")
|
||||
|
||||
article_urls = set()
|
||||
# 根据不同网站判断文章页面
|
||||
is_article_page = False
|
||||
if website.name == "www.news.cn":
|
||||
is_article_page = soup.find("span", id="detailContent") is not None
|
||||
elif website.name == "东方烟草报":
|
||||
# 对于东方烟草报,我们增加基于URL模式的判断
|
||||
# 东方烟草报的文章URL通常包含/content/和日期格式
|
||||
parsed_url = urlparse(url)
|
||||
path = parsed_url.path
|
||||
is_article_page = (
|
||||
soup.find("div", class_="content") is not None or
|
||||
soup.find("div", id="gallery") is not None or
|
||||
soup.find("div", id="ContentText") is not None or
|
||||
("/content/" in path and len(path) > 20)
|
||||
)
|
||||
else:
|
||||
# 默认判断逻辑
|
||||
is_article_page = (
|
||||
soup.find("div", class_="content") is not None or
|
||||
soup.find("div", id="content") is not None
|
||||
)
|
||||
|
||||
# 如果是文章页面,则调用文章处理
|
||||
if is_article_page:
|
||||
process_article(url, website)
|
||||
pages_crawled += 1
|
||||
|
||||
# 扩展队列,发现新链接
|
||||
for link in soup.find_all("a", href=True):
|
||||
href = link["href"]
|
||||
if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
|
||||
article_urls.add(href)
|
||||
|
||||
print(f"在列表页找到 {len(article_urls)} 篇文章链接")
|
||||
for url in article_urls:
|
||||
print("文章链接:", url)
|
||||
|
||||
from core.utils import crawl_xinhua_article
|
||||
for article_url in article_urls:
|
||||
crawl_xinhua_article(article_url, website)
|
||||
href = urljoin(url, link["href"])
|
||||
if href not in visited and is_valid_url(href, base_netloc):
|
||||
queue.append(href)
|
||||
|
||||
@@ -1,28 +1,44 @@
|
||||
from django.shortcuts import render, get_object_or_404
|
||||
from django.shortcuts import render
|
||||
from django.core.paginator import Paginator
|
||||
from .models import Article
|
||||
from .models import Article, Website
|
||||
|
||||
def article_list(request):
|
||||
"""
|
||||
显示文章列表的视图函数
|
||||
"""
|
||||
articles = Article.objects.all().order_by('-created_at')
|
||||
paginator = Paginator(articles, 20) # 每页显示10篇文章
|
||||
# 获取所有启用的网站
|
||||
websites = Website.objects.filter(enabled=True)
|
||||
|
||||
# 获取筛选网站
|
||||
selected_website = None
|
||||
articles = Article.objects.all()
|
||||
|
||||
website_id = request.GET.get('website')
|
||||
if website_id:
|
||||
try:
|
||||
selected_website = Website.objects.get(id=website_id)
|
||||
articles = articles.filter(website=selected_website)
|
||||
except Website.DoesNotExist:
|
||||
pass
|
||||
|
||||
# 新增:处理关键词搜索
|
||||
search_query = request.GET.get('q')
|
||||
if search_query:
|
||||
articles = articles.filter(title__icontains=search_query)
|
||||
|
||||
# 按创建时间倒序排列
|
||||
articles = articles.order_by('-created_at')
|
||||
|
||||
# 分页
|
||||
paginator = Paginator(articles, 10) # 每页显示10篇文章
|
||||
page_number = request.GET.get('page')
|
||||
page_obj = paginator.get_page(page_number)
|
||||
|
||||
return render(request, 'core/article_list.html', {
|
||||
'page_obj': page_obj
|
||||
'page_obj': page_obj,
|
||||
'websites': websites,
|
||||
'selected_website': selected_website,
|
||||
# 新增:传递搜索关键词到模板
|
||||
'search_query': search_query
|
||||
})
|
||||
|
||||
def article_detail(request, article_id):
|
||||
"""
|
||||
显示文章详情的视图函数
|
||||
"""
|
||||
article = get_object_or_404(Article, id=article_id)
|
||||
return render(request, 'core/article_detail.html', {
|
||||
'article': article
|
||||
})
|
||||
|
||||
# Create your views here.
|
||||
article = Article.objects.get(id=article_id)
|
||||
return render(request, 'core/article_detail.html', {'article': article})
|
||||
|
||||
@@ -1,10 +1,15 @@
|
||||
from django.contrib import admin
|
||||
from django.urls import path, include
|
||||
from django.conf import settings
|
||||
from django.conf.urls.static import static
|
||||
from django.contrib import admin
|
||||
from django.urls import path, include
|
||||
|
||||
# 需要导入自定义的管理站点实例
|
||||
from core.admin import news_cn_admin, dongfangyancao_admin
|
||||
|
||||
urlpatterns = [
|
||||
path('admin/', admin.site.urls),
|
||||
path('news_cn_admin/', news_cn_admin.urls),
|
||||
path('dongfangyancao_admin/', dongfangyancao_admin.urls),
|
||||
# 以后前台访问放 core app 的 urls
|
||||
path('', include('core.urls')),
|
||||
]
|
||||
|
||||
31
requirements.txt
Normal file
31
requirements.txt
Normal file
@@ -0,0 +1,31 @@
|
||||
asgiref==3.9.1
|
||||
asttokens==3.0.0
|
||||
beautifulsoup4==4.13.4
|
||||
bs4==0.0.2
|
||||
certifi==2025.8.3
|
||||
charset-normalizer==3.4.3
|
||||
decorator==5.2.1
|
||||
Django==5.1
|
||||
executing==2.2.0
|
||||
idna==3.10
|
||||
ipython==9.4.0
|
||||
ipython_pygments_lexers==1.1.1
|
||||
jedi==0.19.2
|
||||
lxml==6.0.0
|
||||
matplotlib-inline==0.1.7
|
||||
parso==0.8.4
|
||||
pexpect==4.9.0
|
||||
prompt_toolkit==3.0.51
|
||||
ptyprocess==0.7.0
|
||||
pure_eval==0.2.3
|
||||
Pygments==2.19.2
|
||||
python-docx==1.2.0
|
||||
requests==2.32.4
|
||||
soupsieve==2.7
|
||||
sqlparse==0.5.3
|
||||
stack-data==0.6.3
|
||||
traitlets==5.14.3
|
||||
typing_extensions==4.14.1
|
||||
urllib3==2.5.0
|
||||
uv==0.8.8
|
||||
wcwidth==0.2.13
|
||||
Reference in New Issue
Block a user