diff --git a/core/utils.py b/core/utils.py index 11e1138..1ae3a49 100644 --- a/core/utils.py +++ b/core/utils.py @@ -618,7 +618,7 @@ def process_article(url, website): soup.find("div", class_="article-body") ) - # 如果找到content-two,需要进一步处理去除内部的标题 + # 如果找到content-two,需要进一步处理去除内部的标题元素(避免重复) if content_tag and content_tag.get('class') and 'content-two' in content_tag.get('class', []): # 查找并移除内容中的标题元素(避免重复) inner_titles = content_tag.find_all(['h1', 'h2']) @@ -1205,6 +1205,7 @@ def full_site_crawler(start_url, website, max_pages=1000): soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or + ("/content/" in path and "content_" in path) or # 法治日报特有的文章URL模式 ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10)