Add Find weburl
@@ -1,9 +1,9 @@
 import os
 import requests
 from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
 from django.utils import timezone
 from django.conf import settings
-from urllib.parse import urljoin
 from core.models import Article

 def download_media(url, save_dir):
@@ -27,12 +27,12 @@ def download_media(url, save_dir):

     with open(filepath, "wb") as f:
         f.write(resp.content)
-    return filepath
+
+    # Return a relative path, which is easier to store in the database and to display
+    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")

 def crawl_xinhua_article(url, website):
-    headers = {
-        "User-Agent": "Mozilla/5.0"
-    }
+    headers = {"User-Agent": "Mozilla/5.0"}
     resp = requests.get(url, headers=headers)
     resp.encoding = 'utf-8'
     soup = BeautifulSoup(resp.text, "html.parser")
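Note: the hunk above only shows the tail of download_media. As a rough sketch, the full helper presumably looks something like the following; only resp, filepath and the final return appear in the diff, while the timeout, the status check and the filename derivation are assumptions for illustration.

import os
import requests
from urllib.parse import urlparse
from django.conf import settings

def download_media(url, save_dir):
    # Fetch the remote file; treat a failed download as "nothing saved".
    resp = requests.get(url, timeout=10)
    if resp.status_code != 200:
        return None

    # Derive a filename from the URL path and make sure the target directory exists.
    filename = os.path.basename(urlparse(url).path) or "unnamed"
    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)

    with open(filepath, "wb") as f:
        f.write(resp.content)

    # Return a path relative to MEDIA_ROOT so callers can prepend MEDIA_URL when rendering.
    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")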
@@ -42,7 +42,7 @@ def crawl_xinhua_article(url, website):

     content_tag = soup.find("span", id="detailContent")
     if not content_tag:
-        print("Article body not found")
+        print(f"Article body not found, skipping article: {url}")
         return

     imgs = content_tag.find_all("img")
@@ -54,17 +54,17 @@ def crawl_xinhua_article(url, website):

     for img in imgs:
         src = img.get("src")
-        print("Original image src =", src)
         if not src:
             continue
-        # Join against the article page URL as the base to make sure the image link is complete
-        src = urljoin(url, src)
-        print("Joined image URL =", src)
-        local_path = download_media(src, save_dir)
-        if local_path:
-            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
-            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
-            media_files.append(rel_path.replace("\\", "/"))
+
+        # Resolve relative paths against the article URL as the base to avoid broken links
+        if not src.startswith("http"):
+            src = urljoin(url, src)
+
+        local_rel_path = download_media(src, save_dir)
+        if local_rel_path:
+            img["src"] = settings.MEDIA_URL + local_rel_path
+            media_files.append(local_rel_path)

     content_html = str(content_tag)

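Note: the rewritten loop only calls urljoin when src is not already absolute. A quick illustration of how urljoin behaves in that case (both URLs below are made up for the example):

from urllib.parse import urljoin

article_url = "https://www.news.cn/legal/20240101/abc123/c.html"

# A relative src is resolved against the article page URL.
print(urljoin(article_url, "images/pic.jpg"))
# https://www.news.cn/legal/20240101/abc123/images/pic.jpg

# An already absolute src passes through unchanged.
print(urljoin(article_url, "https://cdn.example.com/pic.jpg"))
# https://cdn.example.com/pic.jpg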
@@ -81,3 +81,23 @@ def crawl_xinhua_article(url, website):
         media_files=media_files
     )
     print(f"Saved article and images: {title}")
+
+def crawl_xinhua_list(list_url, website):
+    headers = {"User-Agent": "Mozilla/5.0"}
+    resp = requests.get(list_url, headers=headers)
+    resp.encoding = 'utf-8'
+    soup = BeautifulSoup(resp.text, "html.parser")
+
+    article_urls = set()
+    for link in soup.find_all("a", href=True):
+        href = link["href"]
+        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
+            article_urls.add(href)
+
+    print(f"Found {len(article_urls)} article links on the list page")
+    for url in article_urls:
+        print("Article link:", url)
+
+    from core.utils import crawl_xinhua_article
+    for article_url in article_urls:
+        crawl_xinhua_article(article_url, website)
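Note: nothing in this commit actually calls crawl_xinhua_list. A sketch of how it might be driven, for example from a Django shell; the Website model, the lookup and the list URL are assumptions here, only crawl_xinhua_list and its (list_url, website) signature come from the diff.

# Hypothetical wiring, not part of the commit.
from core.models import Website  # assumed model providing the `website` argument
from core.utils import crawl_xinhua_list

website = Website.objects.get(name="Xinhua")
crawl_xinhua_list("https://www.news.cn/legal/", website)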