Fix chinadaily bug: Support more packages
@@ -3,6 +3,7 @@ from core.models import Website
 from core.utils import full_site_crawler
 
 
+# jimmy.fang-20250815: 因URL问题,移除中国网-省份
 class Command(BaseCommand):
     help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"
 
@@ -22,12 +23,12 @@ class Command(BaseCommand):
                 'start_url': 'http://www.china.com.cn',
                 'article_selector': 'a'
             },
-            'province': {
-                'name': '中国网一省份',
-                'base_url': 'http://www.china.com.cn',
-                'start_url': 'http://www.china.com.cn/province',
-                'article_selector': 'a'
-            }
+            # 'province': {
+            #     'name': '中国网一省份',
+            #     'base_url': 'http://www.china.com.cn',
+            #     'start_url': 'http://www.china.com.cn/province',
+            #     'article_selector': 'a'
+            # }
         }
 
         if platform == 'all':
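Net effect of these two hunks: the 中国网-省份 ("provinces") target is commented out rather than deleted, per the dated note above (which reads "removed 中国网-省份 due to URL problems"), so only the main-site entry stays live. The hunk stops at if platform == 'all':, so the dict's consumer isn't shown; given the full_site_crawler import in the first hunk and the signature visible further down (full_site_crawler(start_url, website, max_pages=1000)), the driver presumably looks something like this sketch, in which everything except that signature and the dict is a guess:

    # Hypothetical driver loop; only full_site_crawler's signature and the
    # platforms dict come from the diff, the rest is illustrative.
    if platform == 'all':
        selected = list(platforms.values())
    else:
        selected = [platforms[platform]]
    for cfg in selected:
        # website: the core.models.Website row for this crawl target
        full_site_crawler(cfg['start_url'], website, max_pages=1000)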
@@ -9,7 +9,7 @@ class Command(BaseCommand):
     def add_arguments(self, parser):
         parser.add_argument('--platform', type=str, default='all',
                             choices=['chinadaily', 'mobile', 'all'],
-                            help='选择爬取平台: chinadaily(中国日报), mobile(移动端), all(全部)')
+                            help='选择爬取平台: chinadaily(中国日报), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
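This hunk trims mobile(移动端) out of the --platform help string (which reads "choose crawl platform: ..."), matching the removal of the mobile config in the next hunk. The choices list one line up still accepts 'mobile', though; if that is an oversight rather than intentional, a consistent version would be the following sketch (not what the commit ships):

    # Sketch only: choices aligned with the new help text; the commit itself
    # still lists 'mobile' in choices.
    parser.add_argument('--platform', type=str, default='all',
                        choices=['chinadaily', 'all'],
                        help='选择爬取平台: chinadaily(中国日报), all(全部)')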
@@ -22,12 +22,7 @@ class Command(BaseCommand):
                 'start_url': 'https://www.chinadaily.com.cn',
                 'article_selector': 'a'
             },
-            'mobile': {
-                'name': '中国日报移动端',
-                'base_url': 'https://m.chinadaily.com.cn',
-                'start_url': 'https://m.chinadaily.com.cn',
-                'article_selector': 'a'
-            }
         }
 
         if platform == 'all':
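With the mobile entry gone, --platform mobile (still allowed by choices) now has no matching config and would presumably raise a KeyError or silently crawl nothing, depending on how the handler looks entries up. The command itself can be driven from a Django shell or a test via call_command; crawl_chinadaily below is a placeholder, since the diff never shows the command's file name:

    from django.core.management import call_command

    # "crawl_chinadaily" is a hypothetical module name for this command
    call_command("crawl_chinadaily", platform="chinadaily")
    call_command("crawl_chinadaily", platform="all")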
@@ -233,12 +233,8 @@ def process_article(url, website):
             soup.find("div", id="content") or
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
-            soup.find("div", class_="article") or
-            # 添加央视新闻特有的内容容器
-            soup.find("div", class_="content_area") or
-            soup.find("div", id="content_area")
+            soup.find("div", class_="article")
         )
-
     elif "求是" in website.name:
         # 求是网站的文章结构处理 - 修复两个标题问题
         title_tag = (
@@ -360,6 +356,7 @@ def process_article(url, website):
             soup.find("title")
         )
         content_tag = (
+            soup.find("div", id="Content") or  # 中国日报特有内容容器
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
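The capital C in id="Content" is the point of this one-line addition (its trailing comment reads "chinadaily-specific content container"): BeautifulSoup matches attribute values case-sensitively, so the existing soup.find("div", id="content") fallback never hits chinadaily's container. A quick standalone check:

    from bs4 import BeautifulSoup

    html = '<div id="Content"><p>article body</p></div>'
    soup = BeautifulSoup(html, "html.parser")
    print(soup.find("div", id="content"))  # None - id matching is case-sensitive
    print(soup.find("div", id="Content"))  # <div id="Content">...</div>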
@@ -368,6 +365,41 @@ def process_article(url, website):
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
+
+        # 处理中国日报的分页内容
+        if content_tag and ("中国日报" in website.name or "chinadaily" in website.name):
+            # 查找分页链接
+            page_links = []
+            current_page_elem = soup.find("div", id="div_currpage")
+            if current_page_elem:
+                # 查找所有分页链接
+                page_links = [a for a in current_page_elem.find_all("a", href=True) if not a.find("img")]
+
+            # 如果有分页,收集所有页面内容
+            if page_links:
+                print(f"发现分页内容,共 {len(page_links)} 页需要处理")
+                # 收集所有页面的内容
+                all_content_html = str(content_tag)
+
+                # 处理每个分页链接
+                for page_link in page_links:
+                    page_url = urljoin(url, page_link['href'])
+                    if page_url != url:  # 避免重复处理第一页
+                        try:
+                            page_resp = requests.get(page_url, headers=headers, timeout=15)
+                            page_resp.encoding = 'utf-8'
+                            page_soup = BeautifulSoup(page_resp.text, "html.parser")
+
+                            # 提取分页内容
+                            page_content = page_soup.find("div", id="Content")
+                            if page_content:
+                                all_content_html += str(page_content)
+                                print(f"已处理分页: {page_url}")
+                        except Exception as e:
+                            print(f"处理分页失败 {page_url}: {e}")
+
+                # 更新content_tag为包含所有分页内容
+                content_tag = BeautifulSoup(all_content_html, "html.parser")
     elif "工人日报" in website.name or "workercn" in website.name:
         # 工人日报的文章结构处理 - 修复不保存文章内容问题
         title_tag = (
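This is the heart of the fix. chinadaily splits long articles across pages linked from a div with id="div_currpage", and each continuation page carries its own div#Content; the added block (its comments read "handle China Daily paginated content", "find pagination links", "avoid re-processing the first page", "update content_tag to include all page content") fetches each pager link, concatenates the Content divs, and re-parses the combined HTML. Inline it leans on the surrounding url, soup, headers, and content_tag variables; extracted on its own it amounts to roughly this self-contained sketch (the function name and HEADERS are illustrative):

    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin

    HEADERS = {"User-Agent": "Mozilla/5.0"}  # stand-in for the module's headers

    def collect_paginated_content(url, soup, content_tag):
        """Return content_tag plus the div#Content of every continuation page."""
        pager = soup.find("div", id="div_currpage")
        if pager is None:
            return content_tag
        # Skip image-only anchors (prev/next arrows), as the commit does.
        page_links = [a for a in pager.find_all("a", href=True) if not a.find("img")]
        if not page_links:
            return content_tag
        all_content_html = str(content_tag)
        for a in page_links:
            page_url = urljoin(url, a["href"])
            if page_url == url:  # first page is already in content_tag
                continue
            try:
                resp = requests.get(page_url, headers=HEADERS, timeout=15)
                resp.encoding = "utf-8"
                page_soup = BeautifulSoup(resp.text, "html.parser")
                page_content = page_soup.find("div", id="Content")
                if page_content:
                    all_content_html += str(page_content)
            except Exception as exc:
                print(f"failed to fetch continuation page {page_url}: {exc}")
        return BeautifulSoup(all_content_html, "html.parser")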
@@ -1003,6 +1035,8 @@ def full_site_crawler(start_url, website, max_pages=1000):
         soup.find("div", class_="article-content") is not None or
         (soup.find("div", id="content") is not None and
          soup.find("h1") is not None) or
+        (soup.find("div", id="Content") is not None and  # 中国日报特有内容容器
+         soup.find("h1") is not None) or
         soup.find("div", class_="text") is not None or
         soup.find("div", class_="main-content") is not None or
         soup.find("div", class_="article") is not None or
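The same capitalised container is also taught to full_site_crawler's is-this-an-article heuristic, paired with an h1 check exactly like the lowercase id="content" test above it. The hunk ends mid-expression (the last visible line still ends in or), so further checks follow in the real code; as a standalone predicate the visible portion reads roughly as follows (the helper name is illustrative):

    from bs4 import BeautifulSoup

    def looks_like_article(soup: BeautifulSoup) -> bool:
        # Visible portion of the heuristic only; the real expression
        # continues past this hunk.
        return (
            soup.find("div", class_="article-content") is not None or
            (soup.find("div", id="content") is not None and
             soup.find("h1") is not None) or
            # chinadaily uses a capitalised id="Content" container
            (soup.find("div", id="Content") is not None and
             soup.find("h1") is not None) or
            soup.find("div", class_="text") is not None or
            soup.find("div", class_="main-content") is not None or
            soup.find("div", class_="article") is not None
        )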