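"""Crawler utilities for Xinhua (news.cn): fetch an article page, download
its inline images under MEDIA_ROOT, and store the result as an Article."""
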
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from django.conf import settings
from django.utils import timezone

from core.models import Article


def download_media(url, save_dir):
    """Download a media file and return its path relative to MEDIA_ROOT."""
    try:
        resp = requests.get(url, timeout=15)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Download failed: {url}, error: {e}")
        return None

    # Derive a filename from the URL, dropping any query string; fall back
    # to a generic name when the URL ends with a slash.
    filename = url.split("/")[-1].split("?")[0] or "media"
    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)

    # Avoid overwriting an existing file by appending a counter.
    base, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(filepath):
        filename = f"{base}_{counter}{ext}"
        filepath = os.path.join(save_dir, filename)
        counter += 1

    with open(filepath, "wb") as f:
        f.write(resp.content)

    # Return a path relative to MEDIA_ROOT; this is convenient to store in
    # the database and to build display URLs from.
    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")


def crawl_xinhua_article(url, website):
    # Check for duplicates up front so we don't re-download media for
    # articles that are already stored.
    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return

    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=15)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "Untitled"

    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        print(f"No article body found, skipping: {url}")
        return

    imgs = content_tag.find_all("img")
    media_files = []

    # Build a filesystem-safe directory name from the article title.
    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
    save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title)
    os.makedirs(save_dir, exist_ok=True)

    for img in imgs:
        src = img.get("src")
        if not src:
            continue

        # Resolve relative paths against the article URL to avoid broken links.
        if not src.startswith("http"):
            src = urljoin(url, src)

        local_rel_path = download_media(src, save_dir)
        if local_rel_path:
            # Point the tag at the local copy so the stored HTML stays valid.
            img["src"] = settings.MEDIA_URL + local_rel_path
            media_files.append(local_rel_path)

    content_html = str(content_tag)

    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        pub_date=timezone.now(),
        # Assumes Article.media_files can hold a list of paths
        # (e.g. a JSONField).
        media_files=media_files,
    )
    print(f"Saved article and images: {title}")


def crawl_xinhua_list(list_url, website):
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(list_url, headers=headers, timeout=15)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    # Collect links that match the Xinhua article URL pattern.
    article_urls = set()
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
            article_urls.add(href)

    print(f"Found {len(article_urls)} article links on the list page")
    for article_url in article_urls:
        print("Article link:", article_url)
        crawl_xinhua_article(article_url, website)
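

# A minimal usage sketch (not part of the crawler itself) that wires the two
# functions together. The `Website` model and its `name` field are
# assumptions about core.models; adjust to the actual schema.
def demo_crawl():
    """Crawl the legal-news list page once.

    Call from a Django shell so the ORM is configured, e.g.:
        >>> from core.utils import demo_crawl
        >>> demo_crawl()
    """
    from core.models import Website  # hypothetical model; adjust as needed

    website, _ = Website.objects.get_or_create(name="新华网")
    crawl_xinhua_list("https://www.news.cn/legal/", website)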