From 490cc835d2b7a3e3c6e5271e49c8c46a91fde35f Mon Sep 17 00:00:00 2001
From: yuangyaa <yuangyaa@163.com>
Date: Fri, 15 Aug 2025 04:02:03 +0800
Subject: [PATCH] fix qiushi title detection and content cleanup bug

---
 core/utils.py | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/core/utils.py b/core/utils.py
index 1ae3a49..933f087 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -236,12 +236,16 @@ def process_article(url, website):
                 soup.find("div", class_="article")
         )
     elif "求是" in website.name:
-        # 求是网站的文章结构处理 - 修复两个标题问题
+        # 求是网站的文章结构处理 - 修复标题和正文清理问题
         title_tag = (
                 soup.find("h1", class_="title") or
+                soup.find("h2", class_="title") or
                 soup.find("h1") or
+                soup.find("h2") or
+                soup.find("p", class_="title") or
                 soup.find("title")
         )
+
         content_tag = (
                 soup.find("div", class_="content") or
                 soup.find("div", class_="article-content") or
@@ -251,20 +255,19 @@ def process_article(url, website):
                 soup.find("div", class_="article")
         )
 
-        # 针对求是网的特殊处理，清理内容中的重复标题和无关元素
         if content_tag:
-            # 移除重复标题：查找与文章标题相同的strong标签并移除
+            # 移除重复标题（放宽匹配条件，允许部分匹配）
             if title_tag:
                 title_text = title_tag.get_text(strip=True)
-                # 查找内容中与标题相同的strong标签（通常出现在正文第一段）
-                for strong_tag in content_tag.find_all("strong"):
-                    if strong_tag.get_text().strip() == title_text:
-                        # 检查是否是正文第一段中的重复标题
-                        parent_p = strong_tag.find_parent("p")
-                        if parent_p and parent_p == content_tag.find("p"):
-                            strong_tag.decompose()
+                if title_text:
+                    for strong_tag in content_tag.find_all("strong"):
+                        strong_text = strong_tag.get_text(strip=True)
+                        if title_text in strong_text or strong_text in title_text:
+                            parent_p = strong_tag.find_parent("p")
+                            # 如果 strong 在正文前两段内，就删除
+                            if parent_p in content_tag.find_all("p")[:2]:
+                                strong_tag.decompose()
 
-            # 移除无关的元素
             # 移除分享相关元素
             for share_element in content_tag.find_all("div", class_="sharebox"):
                 share_element.decompose()
@@ -275,8 +278,8 @@ def process_article(url, website):
 
             # 移除编辑信息
             for editor_element in content_tag.find_all("div", class_="fs-text"):
-                if editor_element.get_text() and (
-                        "网站编辑" in editor_element.get_text() or "审核" in editor_element.get_text()):
+                text = editor_element.get_text(strip=True)
+                if text and ("网站编辑" in text or "审核" in text):
                     editor_element.decompose()
 
             # 移除声明链接
@@ -290,18 +293,16 @@ def process_article(url, website):
             # 移除分隔线
             for line_element in content_tag.find_all("div", class_="fs-line"):
                 line_element.decompose()
-
             for line_element in content_tag.find_all("div", class_="fs-line_b"):
                 line_element.decompose()
 
-            # 移除剪贴板相关元素
+            # unwrap 剪贴板相关元素（保留文字，去掉外层标签）
             for clipboard_element in content_tag.find_all("div", class_="clipboard_text"):
-                clipboard_element.unwrap()  # unwrap只移除标签，保留内容
+                clipboard_element.unwrap()
 
-            # 移除highlight包装层，保留内容
+            # unwrap highlight 包装层（保留文字）
             for highlight_element in content_tag.find_all("div", class_="highlight"):
-                highlight_element.unwrap()  # unwrap只移除标签，保留内容
-
+                highlight_element.unwrap()
     elif "解放军报" in website.name or "81.cn" in website.name:
         # 解放军报的文章结构处理 - 修复类别爬取问题
         title_tag = (