Add packages

This commit is contained in:
2025-08-11 22:55:57 +08:00
parent d9d2ea9d99
commit bfd1604872
3 changed files with 56 additions and 18 deletions

View File

@@ -29,10 +29,10 @@ def download_media(url, save_dir):
if not filename or '.' not in filename:
# 如果URL路径中没有有效的文件名使用默认名称
filename = 'media_file'
# 清理文件名中的特殊字符
filename = re.sub(r'[^\w\-_\.]', '_', filename)
# 确保文件有扩展名
if '.' not in filename:
content_type = resp.headers.get('content-type', '')
@@ -77,11 +77,11 @@ def process_article(url, website):
elif website.name == "东方烟草报":
# 优化东方烟草报的标题提取逻辑,按优先级尝试多种选择器
title_tag = (
soup.find("h1", id="title") or # 特别针对带id="title"的h1标签
soup.find("h1") or # 主要标题标签
soup.find("title") or # 页面title标签
soup.find("div", class_="title") or # 某些页面可能使用div.title
soup.find("h2") # 备选标题标签
soup.find("h1", id="title") or # 特别针对带id="title"的h1标签
soup.find("h1") or # 主要标题标签
soup.find("title") or # 页面title标签
soup.find("div", class_="title") or # 某些页面可能使用div.title
soup.find("h2") # 备选标题标签
)
content_tag = soup.find("div", class_="content") # 东方烟草报的内容通常在div.content中
# 增加对另一种内容结构的支持
@@ -96,7 +96,7 @@ def process_article(url, website):
content_tag = soup.find("div", class_="content") or soup.find("div", id="content")
title = title_tag.get_text(strip=True) if title_tag else "无标题"
# 对标题进行额外处理,去除可能的多余空白字符
title = title.strip() if title else "无标题"
@@ -184,16 +184,16 @@ def full_site_crawler(start_url, website, max_pages=1000):
parsed_url = urlparse(url)
path = parsed_url.path
is_article_page = (
soup.find("div", class_="content") is not None or
soup.find("div", id="gallery") is not None or
soup.find("div", id="ContentText") is not None or
("/content/" in path and len(path) > 20)
soup.find("div", class_="content") is not None or
soup.find("div", id="gallery") is not None or
soup.find("div", id="ContentText") is not None or
("/content/" in path and len(path) > 20)
)
else:
# 默认判断逻辑
is_article_page = (
soup.find("div", class_="content") is not None or
soup.find("div", id="content") is not None
soup.find("div", class_="content") is not None or
soup.find("div", id="content") is not None
)
# 如果是文章页面,则调用文章处理
@@ -205,4 +205,4 @@ def full_site_crawler(start_url, website, max_pages=1000):
for link in soup.find_all("a", href=True):
href = urljoin(url, link["href"])
if href not in visited and is_valid_url(href, base_netloc):
queue.append(href)
queue.append(href)