Support export for Word

This commit is contained in:
2025-08-11 23:14:56 +08:00
parent bfd1604872
commit b6bbb90703
2 changed files with 284 additions and 2 deletions

View File

@@ -78,8 +78,11 @@ class Command(BaseCommand):
self.export_as_json(articles_data, output_path)
elif format_type == 'csv':
self.export_as_csv(articles_data, output_path)
# 添加Word格式导出支持
elif format_type == 'docx':
self.export_as_word(articles_data, output_path)
else:
self.stdout.write(self.style.ERROR('不支持的格式,仅支持 json 或 csv'))
self.stdout.write(self.style.ERROR('不支持的格式,仅支持 json、csv 或 docx'))
return
self.stdout.write(self.style.SUCCESS(f'成功导出 {len(articles_data)} 篇文章到 {output_path}'))
@@ -103,6 +106,98 @@ class Command(BaseCommand):
article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
writer.writerow(article_data)
# 添加Word格式导出方法
def export_as_word(self, articles_data, output_path):
    """Export articles to a single Word (.docx) document.

    For every article the document receives a level-1 heading with the
    title, four metadata paragraphs (website, URL, publish time, creation
    time), a content section, an optional media section and a page break.

    Images referenced by ``<img>`` tags in the article HTML are embedded
    ahead of the article text; an image that cannot be embedded is replaced
    by a ``[图片: <src>]`` placeholder paragraph so it is never lost
    silently.

    Args:
        articles_data: Iterable of dicts providing the keys ``title``,
            ``website``, ``url``, ``pub_date``, ``created_at``, ``content``
            (HTML markup, parsed with BeautifulSoup) and ``media_files``
            (an iterable of paths relative to ``MEDIA_ROOT`` or URLs).
        output_path: Destination handed to ``Document.save``.

    Returns:
        None. When python-docx is not installed an error is written to
        stdout and nothing is saved.
    """
    try:
        from docx import Document
        from docx.shared import Inches
    except ImportError:
        # NOTE(review): this returns normally, so a caller that prints a
        # success message after the call will still do so -- consider
        # raising CommandError instead once callers are checked.
        self.stdout.write(self.style.ERROR('缺少python-docx库请安装: pip install python-docx'))
        return

    import os

    from bs4 import BeautifulSoup
    from django.conf import settings

    picture_width = Inches(4.0)

    doc = Document()
    doc.add_heading('文章导出', 0)

    for article_data in articles_data:
        doc.add_heading(article_data['title'], level=1)

        # Article metadata.
        doc.add_paragraph(f"网站: {article_data['website']}")
        doc.add_paragraph(f"URL: {article_data['url']}")
        doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
        doc.add_paragraph(f"创建时间: {article_data['created_at']}")

        # Article body: embed the images, then emit the tag-free text.
        doc.add_heading('内容', level=2)
        # ``or ''`` keeps an article with no content from crashing the parser.
        soup = BeautifulSoup(article_data['content'] or '', 'html.parser')
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                local_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
                if not self._add_word_picture(doc, src, local_path, picture_width):
                    # Covers both a failed download/decode and a missing
                    # local file, so the reader still sees where it was.
                    doc.add_paragraph(f"[图片: {src}]")
            img.decompose()
        doc.add_paragraph(soup.get_text())

        # Attached media files: embed when possible, else list the name.
        if article_data['media_files']:
            doc.add_heading('媒体文件', level=2)
            for media_file in article_data['media_files']:
                local_path = os.path.join(settings.MEDIA_ROOT, media_file)
                if not self._add_word_picture(doc, media_file, local_path, picture_width):
                    doc.add_paragraph(media_file)

        doc.add_page_break()

    doc.save(output_path)


def _add_word_picture(self, doc, source, local_path, width):
    """Embed one picture into ``doc``; return True only if it was added.

    ``source`` is downloaded when it is an http(s) URL; otherwise the file
    at ``local_path`` is used if it exists. Any failure is reported as a
    warning and turned into ``False`` so one bad image cannot abort the
    whole export.
    """
    import os
    from io import BytesIO

    try:
        if source.startswith(('http://', 'https://')):
            # Imported here so a missing ``requests`` only degrades this
            # picture to its text fallback instead of failing the export.
            import requests

            response = requests.get(source, timeout=10)
            # Fail fast on 4xx/5xx rather than feeding an error page to docx.
            response.raise_for_status()
            doc.add_picture(BytesIO(response.content), width=width)
            return True
        if os.path.exists(local_path):
            doc.add_picture(local_path, width=width)
            return True
        return False
    except Exception as exc:  # best-effort: network, I/O or image-format errors
        self.stdout.write(self.style.WARNING(f'图片嵌入失败 {source}: {exc}'))
        return False
def export_with_media(self, articles_data, media_files, output_path, format_type):
# 创建ZIP文件
with zipfile.ZipFile(output_path, 'w') as zipf:
@@ -123,6 +218,90 @@ class Command(BaseCommand):
article_data['media_files'] = ';'.join(article_data['media_files']) if article_data['media_files'] else ''
writer.writerow(article_data)
zipf.writestr(data_filename, csv_buffer.getvalue())
# 添加Word格式支持
elif format_type == 'docx':
# 创建Word文档并保存到ZIP
try:
from docx import Document
from docx.shared import Inches
from io import BytesIO
doc = Document()
doc.add_heading('文章导出', 0)
for article_data in articles_data:
doc.add_heading(article_data['title'], level=1)
doc.add_paragraph(f"网站: {article_data['website']}")
doc.add_paragraph(f"URL: {article_data['url']}")
doc.add_paragraph(f"发布时间: {article_data['pub_date']}")
doc.add_paragraph(f"创建时间: {article_data['created_at']}")
doc.add_heading('内容', level=2)
from bs4 import BeautifulSoup
soup = BeautifulSoup(article_data['content'], 'html.parser')
# 处理内容中的图片
for img in soup.find_all('img'):
src = img.get('src', '')
if src:
# 尝试添加图片到文档
try:
import os
from django.conf import settings
import requests
# 构建完整的图片路径
if src.startswith('http'):
# 网络图片
response = requests.get(src, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
# 本地图片
full_path = os.path.join(settings.MEDIA_ROOT, src.lstrip('/'))
if os.path.exists(full_path):
doc.add_picture(full_path, width=Inches(4.0))
except Exception as e:
# 如果添加图片失败添加图片URL作为文本
doc.add_paragraph(f"[图片: {src}]")
# 移除原始img标签
img.decompose()
content_text = soup.get_text()
doc.add_paragraph(content_text)
if article_data['media_files']:
doc.add_heading('媒体文件', level=2)
for media_file in article_data['media_files']:
try:
import os
from django.conf import settings
full_path = os.path.join(settings.MEDIA_ROOT, media_file)
if os.path.exists(full_path):
# 添加图片到文档
doc.add_picture(full_path, width=Inches(4.0))
else:
# 如果是URL格式的媒体文件
if media_file.startswith('http'):
response = requests.get(media_file, timeout=10)
image_stream = BytesIO(response.content)
doc.add_picture(image_stream, width=Inches(4.0))
else:
doc.add_paragraph(media_file)
except Exception as e:
doc.add_paragraph(media_file)
doc.add_page_break()
# 将文档保存到内存中再写入ZIP
doc_buffer = BytesIO()
doc.save(doc_buffer)
doc_buffer.seek(0)
zipf.writestr(data_filename, doc_buffer.read())
except ImportError:
zipf.writestr(data_filename, "错误缺少python-docx库无法生成Word文档")
# 添加媒体文件
for media_path in media_files: