Add export to frontend

This commit is contained in:
2025-08-13 00:26:39 +08:00
parent baea50bfa0
commit 5e396796ca
9 changed files with 1272 additions and 175 deletions

View File

@@ -1,4 +1,3 @@
# core/utils.py
import os
import requests
from bs4 import BeautifulSoup
@@ -42,6 +41,12 @@ def download_media(url, save_dir):
filename += '.png'
elif 'image/gif' in content_type:
filename += '.gif'
elif 'video/mp4' in content_type:
filename += '.mp4'
elif 'video/avi' in content_type:
filename += '.avi'
elif 'video/quicktime' in content_type:
filename += '.mov'
else:
filename += '.bin' # 默认二进制扩展名
@@ -61,6 +66,7 @@ def download_media(url, save_dir):
def process_article(url, website):
# 检查文章是否已存在,如果存在则跳过
if Article.objects.filter(url=url).exists():
print(f"文章已存在,跳过: {url}")
return
@@ -116,6 +122,8 @@ def process_article(url, website):
return
imgs = content_tag.find_all("img")
# 查找视频元素
videos = content_tag.find_all("video")
media_files = []
safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
@@ -134,17 +142,51 @@ def process_article(url, website):
img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
media_files.append(rel_path.replace("\\", "/"))
# 处理视频文件
for video in videos:
src = video.get("src")
if not src:
# 检查<source>标签
source = video.find("source")
if source:
src = source.get("src")
if not src:
continue
if not src.startswith("http"):
src = urljoin(url, src)
local_path = download_media(src, save_dir)
if local_path:
rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
# 更新视频src属性
if video.get("src"):
video["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
else:
source = video.find("source")
if source:
source["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
media_files.append(rel_path.replace("\\", "/"))
content_html = str(content_tag)
article = Article.objects.create(
website=website,
title=title,
url=url,
content=content_html,
pub_date=timezone.now(),
media_files=media_files
)
print(f"已保存文章及图片:{title}")
try:
# 使用try-except处理可能的数据库约束错误
article = Article.objects.create(
website=website,
title=title,
url=url,
content=content_html,
pub_date=timezone.now(),
media_files=media_files
)
print(f"已保存文章及图片:{title}")
except Exception as e:
# 处理重复URL或其他数据库错误
if "UNIQUE constraint failed" in str(e) and "core_article.url" in str(e):
print(f"文章URL重复跳过保存: {url}")
else:
print(f"保存文章时出错: {url},错误:{e}")
def is_valid_url(url, base_netloc):