Add support for dongfangyaocao
@@ -7,16 +7,44 @@ from collections import deque
 from django.utils import timezone
 from django.conf import settings
 from core.models import Article
+import re


 def download_media(url, save_dir):
     try:
-        resp = requests.get(url, timeout=15)
+        # Add request headers to avoid 403 Forbidden errors
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+            "Referer": urljoin(url, "/")
+        }
+        resp = requests.get(url, timeout=15, headers=headers)
         resp.raise_for_status()
     except Exception as e:
         print(f"Download failed: {url}, error: {e}")
         return None

-    filename = url.split("/")[-1].split("?")[0]
+    # Handle the filename more safely: strip query parameters and special characters
+    parsed_url = urlparse(url)
+    filename = os.path.basename(parsed_url.path)
+    if not filename or '.' not in filename:
+        # Use a default name when the URL path contains no valid filename
+        filename = 'media_file'
+
+    # Clean special characters out of the filename
+    filename = re.sub(r'[^\w\-_\.]', '_', filename)
+
+    # Make sure the file has an extension
+    if '.' not in filename:
+        content_type = resp.headers.get('content-type', '')
+        if 'image/jpeg' in content_type:
+            filename += '.jpg'
+        elif 'image/png' in content_type:
+            filename += '.png'
+        elif 'image/gif' in content_type:
+            filename += '.gif'
+        else:
+            filename += '.bin'  # default binary extension
+
     os.makedirs(save_dir, exist_ok=True)
     filepath = os.path.join(save_dir, filename)
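The extension fallback above only knows three image types. A more general mapping could come from the standard library's mimetypes module; the sketch below is illustrative only and not part of this commit (guess_extension_from_header is a hypothetical helper name):

import mimetypes

def guess_extension_from_header(content_type):
    """Map a Content-Type header value to a file extension, e.g. 'image/png' -> '.png'."""
    mime = content_type.split(";")[0].strip()  # drop parameters such as '; charset=utf-8'
    return mimetypes.guess_extension(mime) or ".bin"  # keep '.bin' as the unknown-type default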
@@ -31,6 +59,7 @@ def download_media(url, save_dir):
         f.write(resp.content)
     return filepath

+
 def process_article(url, website):
     if Article.objects.filter(url=url).exists():
         print(f"Article already exists, skipping: {url}")
@@ -41,10 +70,36 @@ def process_article(url, website):
     resp.encoding = 'utf-8'
     soup = BeautifulSoup(resp.text, "html.parser")

-    title_tag = soup.find("span", class_="title")
-    title = title_tag.get_text(strip=True) if title_tag else "Untitled"
+    # Handle the different article structures per website
+    if website.name == "www.news.cn":
+        title_tag = soup.find("span", class_="title")
+        content_tag = soup.find("span", id="detailContent")
+    elif website.name == "东方烟草报":
+        # Improve title extraction for 东方烟草报: try several selectors in priority order
+        title_tag = (
+            soup.find("h1", id="title") or       # specifically h1 tags with id="title"
+            soup.find("h1") or                   # primary heading tag
+            soup.find("title") or                # the page's <title> tag
+            soup.find("div", class_="title") or  # some pages use div.title
+            soup.find("h2")                      # fallback heading tag
+        )
+        content_tag = soup.find("div", class_="content")  # 东方烟草报 content is usually in div.content
+        # Also support an alternative content structure
+        if not content_tag:
+            content_tag = soup.find("div", id="gallery")
+        # And support one more content structure
+        if not content_tag:
+            content_tag = soup.find("div", id="ContentText")
+    else:
+        # Default handling
+        title_tag = soup.find("h1") or soup.find("title")
+        content_tag = soup.find("div", class_="content") or soup.find("div", id="content")
+
+    title = title_tag.get_text(strip=True) if title_tag else "Untitled"
+
+    # Extra title cleanup: strip any remaining stray whitespace
+    title = title.strip() if title else "Untitled"

-    content_tag = soup.find("span", id="detailContent")
     if not content_tag:
         print("No article body found, skipping:", url)
         return
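As more sites are added, the if/elif chain above keeps growing. One alternative is a per-site table of ordered selector candidates. A minimal sketch under that assumption; SITE_SELECTORS and find_first are illustrative names, not part of this codebase:

# Ordered (tag name, attributes) candidates per site; the first match wins.
SITE_SELECTORS = {
    "www.news.cn": {
        "title": [("span", {"class_": "title"})],
        "content": [("span", {"id": "detailContent"})],
    },
    "东方烟草报": {
        "title": [("h1", {"id": "title"}), ("h1", {}), ("title", {}),
                  ("div", {"class_": "title"}), ("h2", {})],
        "content": [("div", {"class_": "content"}), ("div", {"id": "gallery"}),
                    ("div", {"id": "ContentText"})],
    },
}

def find_first(soup, candidates):
    """Return the first tag matched by an ordered list of (name, attrs) pairs."""
    for name, attrs in candidates:
        tag = soup.find(name, **attrs)
        if tag:
            return tag
    return None

process_article could then call find_first(soup, SITE_SELECTORS[website.name]["title"]), keeping the current else branch as the fallback for unknown sites.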
@@ -80,6 +135,7 @@ def process_article(url, website):
     )
     print(f"Saved article and images: {title}")

+
 def is_valid_url(url, base_netloc):
     try:
         parsed = urlparse(url)
@@ -91,6 +147,7 @@ def is_valid_url(url, base_netloc):
     except Exception:
         return False

+
 def full_site_crawler(start_url, website, max_pages=1000):
     headers = {"User-Agent": "Mozilla/5.0"}
     visited = set()
@@ -117,8 +174,30 @@ def full_site_crawler(start_url, website, max_pages=1000):
         resp.encoding = 'utf-8'
         soup = BeautifulSoup(resp.text, "html.parser")

+        # Decide per website whether this is an article page
+        is_article_page = False
+        if website.name == "www.news.cn":
+            is_article_page = soup.find("span", id="detailContent") is not None
+        elif website.name == "东方烟草报":
+            # For 东方烟草报, add a URL-pattern-based check as well:
+            # its article URLs usually contain /content/ and a date segment
+            parsed_url = urlparse(url)
+            path = parsed_url.path
+            is_article_page = (
+                soup.find("div", class_="content") is not None or
+                soup.find("div", id="gallery") is not None or
+                soup.find("div", id="ContentText") is not None or
+                ("/content/" in path and len(path) > 20)
+            )
+        else:
+            # Default detection logic
+            is_article_page = (
+                soup.find("div", class_="content") is not None or
+                soup.find("div", id="content") is not None
+            )
+
-        if soup.find("span", id="detailContent"):
+        # If this is an article page, invoke the article processor
+        if is_article_page:
             process_article(url, website)
             pages_crawled += 1
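The "/content/" test above is deliberately loose: a substring match plus a path-length threshold. If the date layout of 东方烟草报 article URLs is known, a stricter pattern could replace it. The regex below assumes paths like /content/2024-05/18/c_12345.htm; that layout is a guess and would need verifying against real URLs:

import re

# Hypothetical stricter article-path check; the date layout is an assumption.
ARTICLE_PATH_RE = re.compile(r"^/content/\d{4}-\d{2}/\d{2}/.+")

def looks_like_article_path(path):
    """True for paths such as /content/2024-05/18/c_12345.htm."""
    return ARTICLE_PATH_RE.match(path) is not None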
@@ -126,4 +205,4 @@ def full_site_crawler(start_url, website, max_pages=1000):
         for link in soup.find_all("a", href=True):
             href = urljoin(url, link["href"])
             if href not in visited and is_valid_url(href, base_netloc):
-                queue.append(href)
+                queue.append(href)
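For reference, a hypothetical invocation of the crawler, e.g. from a Django shell. The Website model, the module path, and the start URL are assumptions, not taken from this diff:

# Run inside `python manage.py shell` (assumed project setup)
from core.models import Website        # assumed model; only Article is imported in this diff
from crawler import full_site_crawler  # hypothetical module path for this file

website = Website.objects.get(name="东方烟草报")
full_site_crawler("https://www.eastobacco.com/", website, max_pages=200)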