Add support for full-site crawling

2025-08-11 14:33:32 +08:00
parent 969d46b070
commit 6d80326a4e
2 changed files with 76 additions and 30 deletions
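
At a high level, this change swaps the single list-page crawl (crawl_xinhua_list) for a breadth-first, same-domain crawler (full_site_crawler) that hands article pages to process_article. Below is a minimal sketch of how the new entry point could be wired up as a Django management command; the command module, the Website lookup field, and the argument names are assumptions for illustration, not part of this commit.

# core/management/commands/crawl_site.py -- hypothetical location, for illustration only
from django.core.management.base import BaseCommand

from core.models import Website  # assumes a Website model exists next to Article; the field below is a guess
from core.utils import full_site_crawler


class Command(BaseCommand):
    help = "Breadth-first crawl of a whole site, saving article pages as they are found"

    def add_arguments(self, parser):
        parser.add_argument("start_url")
        parser.add_argument("--max-pages", type=int, default=1000)

    def handle(self, *args, **options):
        # Attach crawled articles to an existing Website row (lookup field is assumed)
        website = Website.objects.get(base_url=options["start_url"])
        full_site_crawler(options["start_url"], website, max_pages=options["max_pages"])

Under those assumptions it would run as, e.g., python manage.py crawl_site https://www.news.cn/legal/ --max-pages 500.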


@@ -1,7 +1,9 @@
# core/utils.py
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
from django.utils import timezone
from django.conf import settings
from core.models import Article
@@ -27,11 +29,13 @@ def download_media(url, save_dir):
    with open(filepath, "wb") as f:
        f.write(resp.content)
    return filepath
    # Return a path relative to MEDIA_ROOT, which is easier to store in the database and to display
    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")
def process_article(url, website):
    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return
def crawl_xinhua_article(url, website):
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
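
Since download_media now returns a MEDIA_ROOT-relative path with forward slashes (and a falsy value on failure, as the caller's if check implies), callers only need to prefix MEDIA_URL. A small sketch under that assumption; the image URL and save directory below are made up:

import os
from django.conf import settings
from core.utils import download_media

save_dir = os.path.join(settings.MEDIA_ROOT, "xinhua")  # hypothetical target directory
rel_path = download_media("https://www.news.cn/some/image.jpg", save_dir)  # hypothetical image URL
if rel_path:
    public_url = settings.MEDIA_URL + rel_path  # e.g. /media/xinhua/image.jpg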
@@ -42,7 +46,7 @@ def crawl_xinhua_article(url, website):
    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        print(f"Article body not found, skipping article: {url}")
        print("Article body not found, skipping:", url)
        return
    imgs = content_tag.find_all("img")
@@ -56,22 +60,16 @@ def crawl_xinhua_article(url, website):
        src = img.get("src")
        if not src:
            continue
        # Use the article URL as the base when joining relative paths, to avoid broken image links
        if not src.startswith("http"):
            src = urljoin(url, src)
        local_rel_path = download_media(src, save_dir)
        if local_rel_path:
            img["src"] = settings.MEDIA_URL + local_rel_path
            media_files.append(local_rel_path)
        local_path = download_media(src, save_dir)
        if local_path:
            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
            media_files.append(rel_path.replace("\\", "/"))
    content_html = str(content_tag)
    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return
    article = Article.objects.create(
        website=website,
        title=title,
@@ -82,22 +80,50 @@ def crawl_xinhua_article(url, website):
    )
    print(f"Saved article and images: {title}")
def crawl_xinhua_list(list_url, website):
def is_valid_url(url, base_netloc):
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return False
        if parsed.netloc != base_netloc:
            return False
        return True
    except Exception:
        return False
def full_site_crawler(start_url, website, max_pages=1000):
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(list_url, headers=headers)
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")
    visited = set()
    queue = deque([start_url])
    article_urls = set()
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
            article_urls.add(href)
    base_netloc = urlparse(start_url).netloc
    print(f"Found {len(article_urls)} article links on the list page")
    for url in article_urls:
        print("Article link:", url)
    pages_crawled = 0
    from core.utils import crawl_xinhua_article
    for article_url in article_urls:
        crawl_xinhua_article(article_url, website)
    while queue and pages_crawled < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        print(f"Crawling: {url}")
        visited.add(url)
        try:
            resp = requests.get(url, headers=headers, timeout=15)
            resp.raise_for_status()
        except Exception as e:
            print(f"Request failed: {url}, error: {e}")
            continue
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")
        # If this is an article page, hand it to the article processor
        if soup.find("span", id="detailContent"):
            process_article(url, website)
        pages_crawled += 1
        # Expand the queue with newly discovered links
        for link in soup.find_all("a", href=True):
            href = urljoin(url, link["href"])
            if href not in visited and is_valid_url(href, base_netloc):
                queue.append(href)
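
For reference, is_valid_url keeps the queue on the starting host and restricts it to http(s), so the breadth-first loop above never wanders off-site. A quick sketch of the expected filtering, assuming both helpers are importable from core.utils; the example URLs are made up:

from urllib.parse import urlparse
from core.utils import is_valid_url, full_site_crawler

base = urlparse("https://www.news.cn/legal/").netloc            # "www.news.cn"
is_valid_url("https://www.news.cn/legal/2025/abc.html", base)   # True: same host, http(s) scheme
is_valid_url("https://example.com/story.html", base)            # False: different host
is_valid_url("mailto:editor@news.cn", base)                     # False: non-http(s) scheme
# full_site_crawler("https://www.news.cn/legal/", website, max_pages=200)  # website: an existing core.models row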