Add export into front
@@ -1,4 +1,3 @@
# core/utils.py
import os
import requests
from bs4 import BeautifulSoup
@@ -42,6 +41,12 @@ def download_media(url, save_dir):
        filename += '.png'
    elif 'image/gif' in content_type:
        filename += '.gif'
    elif 'video/mp4' in content_type:
        filename += '.mp4'
    elif 'video/avi' in content_type:
        filename += '.avi'
    elif 'video/quicktime' in content_type:
        filename += '.mov'
    else:
        filename += '.bin'  # default binary extension for unrecognized content types
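    # Illustration (an assumption, not in the original code): the same branching could be
    # written as a table lookup over content_type:
    #   _EXT_BY_TYPE = {'image/png': '.png', 'image/gif': '.gif', 'video/mp4': '.mp4',
    #                   'video/avi': '.avi', 'video/quicktime': '.mov'}
    #   filename += next((ext for key, ext in _EXT_BY_TYPE.items() if key in content_type), '.bin')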
@@ -61,6 +66,7 @@ def download_media(url, save_dir):


def process_article(url, website):
    # Check whether the article already exists; skip it if so
    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return
@@ -116,6 +122,8 @@ def process_article(url, website):
        return

    imgs = content_tag.find_all("img")
    # Find video elements
    videos = content_tag.find_all("video")
    media_files = []

    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
@@ -134,17 +142,51 @@ def process_article(url, website):
            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
            media_files.append(rel_path.replace("\\", "/"))

    # Process video files
    for video in videos:
        src = video.get("src")
        if not src:
            # Check the <source> tag
            source = video.find("source")
            if source:
                src = source.get("src")

        if not src:
            continue

        if not src.startswith("http"):
            src = urljoin(url, src)
        local_path = download_media(src, save_dir)
        if local_path:
            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
            # Update the video's src attribute
            if video.get("src"):
                video["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
            else:
                source = video.find("source")
                if source:
                    source["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
            media_files.append(rel_path.replace("\\", "/"))

    content_html = str(content_tag)

    article = Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        pub_date=timezone.now(),
        media_files=media_files
    )
    print(f"Saved article and images: {title}")
    try:
        # Use try/except to handle possible database constraint errors
        article = Article.objects.create(
            website=website,
            title=title,
            url=url,
            content=content_html,
            pub_date=timezone.now(),
            media_files=media_files
        )
        print(f"Saved article and images: {title}")
    except Exception as e:
        # Handle duplicate URLs and other database errors
        if "UNIQUE constraint failed" in str(e) and "core_article.url" in str(e):
            print(f"Duplicate article URL, skipping save: {url}")
        else:
            print(f"Error saving article: {url}, error: {e}")


def is_valid_url(url, base_netloc):