From d64bf93988ca08321dd3aa108ee99f39419748f4 Mon Sep 17 00:00:00 2001
From: yuangyaa <yuangyaa@163.com>
Date: Fri, 15 Aug 2025 02:38:14 +0800
Subject: [PATCH] Fix fzrb bug: add support for catching fzrb article URLs

---
 core/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/core/utils.py b/core/utils.py
index 11e1138..1ae3a49 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -618,7 +618,7 @@ def process_article(url, website):
                 soup.find("div", class_="article-body")
         )
 
-        # 如果找到content-two，需要进一步处理去除内部的标题
+        # 如果找到content-two，需要进一步处理去除内部的标题元素（避免重复）
         if content_tag and content_tag.get('class') and 'content-two' in content_tag.get('class', []):
             # 查找并移除内容中的标题元素（避免重复）
             inner_titles = content_tag.find_all(['h1', 'h2'])
@@ -1205,6 +1205,7 @@ def full_site_crawler(start_url, website, max_pages=1000):
                     soup.find("div", class_="main-content") is not None or
                     soup.find("div", class_="article") is not None or
                     soup.find("div", class_="article-body") is not None or
+                    ("/content/" in path and "content_" in path) or  # 法治日报特有的文章URL模式
                     ("/article/" in path) or
                     ("/content/" in path) or
                     (path.startswith("/detail/") and len(path) > 10)