Fix fzrb bug: add support for scraping fzrb
This commit is contained in:
@@ -618,7 +618,7 @@ def process_article(url, website):
|
|||||||
soup.find("div", class_="article-body")
|
soup.find("div", class_="article-body")
|
||||||
)
|
)
|
||||||
|
|
||||||
# 如果找到content-two,需要进一步处理去除内部的标题
|
# 如果找到content-two,需要进一步处理去除内部的标题元素(避免重复)
|
||||||
if content_tag and content_tag.get('class') and 'content-two' in content_tag.get('class', []):
|
if content_tag and content_tag.get('class') and 'content-two' in content_tag.get('class', []):
|
||||||
# 查找并移除内容中的标题元素(避免重复)
|
# 查找并移除内容中的标题元素(避免重复)
|
||||||
inner_titles = content_tag.find_all(['h1', 'h2'])
|
inner_titles = content_tag.find_all(['h1', 'h2'])
|
||||||
@@ -1205,6 +1205,7 @@ def full_site_crawler(start_url, website, max_pages=1000):
|
|||||||
soup.find("div", class_="main-content") is not None or
|
soup.find("div", class_="main-content") is not None or
|
||||||
soup.find("div", class_="article") is not None or
|
soup.find("div", class_="article") is not None or
|
||||||
soup.find("div", class_="article-body") is not None or
|
soup.find("div", class_="article-body") is not None or
|
||||||
|
("/content/" in path and "content_" in path) or # 法治日报特有的文章URL模式
|
||||||
("/article/" in path) or
|
("/article/" in path) or
|
||||||
("/content/" in path) or
|
("/content/" in path) or
|
||||||
(path.startswith("/detail/") and len(path) > 10)
|
(path.startswith("/detail/") and len(path) > 10)
|
||||||
|
|||||||
Reference in New Issue
Block a user