From bfd16048723286dfb7a5c6b36467704193ed061a Mon Sep 17 00:00:00 2001
From: yuangyaa
Date: Mon, 11 Aug 2025 22:55:57 +0800
Subject: [PATCH] Widen page layout, refine pagination styles, and tidy
 whitespace in core/utils.py

---
 core/templates/core/article_detail.html |  2 +-
 core/templates/core/article_list.html   | 42 +++++++++++++++++++++++--
 core/utils.py                           | 30 +++++++++---------
 3 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/core/templates/core/article_detail.html b/core/templates/core/article_detail.html
index f0aee66..155b24b 100644
--- a/core/templates/core/article_detail.html
+++ b/core/templates/core/article_detail.html
@@ -8,7 +8,7 @@
             font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
             line-height: 1.6;
             color: #333;
-            max-width: 800px;
+            max-width: 1200px; /* Changed: keep the detail page's max width in sync with the list page */
             margin: 0 auto;
             padding: 20px;
             background-color: #f8f9fa;
diff --git a/core/templates/core/article_list.html b/core/templates/core/article_list.html
index 87764b1..faaa36f 100644
--- a/core/templates/core/article_list.html
+++ b/core/templates/core/article_list.html
@@ -8,7 +8,7 @@
             font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
             line-height: 1.6;
             color: #333;
-            max-width: 800px;
+            max-width: 1200px; /* Changed: increase the page's max width */
             margin: 0 auto;
             padding: 20px;
             background-color: #f8f9fa;
@@ -80,7 +80,7 @@
             color: white;
             text-decoration: none;
             border-radius: 4px;
-            margin: 0 5px;
+            margin: 0 2px; /* Changed: tighten the spacing between page numbers */
         }
         .pagination a:hover {
             background-color: #2980b9;
@@ -89,6 +89,17 @@
             margin: 0 10px;
             color: #7f8c8d;
         }
+        /* Added: style for the current page number */
+        .pagination .current {
+            background-color: #2980b9;
+            cursor: default;
+        }
+        /* Added: style for the ellipsis between page ranges */
+        .pagination .ellipsis {
+            display: inline-block;
+            padding: 8px 4px;
+            color: #7f8c8d;
+        }
@@ -117,19 +128,46 @@
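The `@@ -117,19 +128,46 @@` hunk presumably adds the pagination markup that the new `.current` and `.ellipsis` classes style. A minimal sketch, assuming Django >= 3.2, of how a view could produce a matching elided page range; the `elided_pages` helper and its parameters are illustrative, not part of this patch:

```python
# Hypothetical helper (not in this patch): builds the elided page range
# that the .current / .ellipsis styles above are meant to render.
from django.core.paginator import Paginator

def elided_pages(object_list, page_number, per_page=10):
    paginator = Paginator(object_list, per_page)
    page = paginator.get_page(page_number)
    # get_elided_page_range yields ints plus Paginator.ELLIPSIS ('…'),
    # e.g. 1, 2, 3, '…', 42 when page 2 of 42 is requested.
    return page, paginator.get_elided_page_range(
        page.number, on_each_side=2, on_ends=1
    )
```

In the template, each int would render as a link, the entry equal to `page.number` as `<span class="current">`, and `Paginator.ELLIPSIS` as `<span class="ellipsis">…</span>`.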
diff --git a/core/utils.py b/core/utils.py
index 2fd3ff8..ce8236b 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -29,10 +29,10 @@ def download_media(url, save_dir):
     if not filename or '.' not in filename:
         # If the URL path has no usable filename, fall back to a default name
         filename = 'media_file'
-    
+
     # Strip special characters out of the filename
     filename = re.sub(r'[^\w\-_\.]', '_', filename)
-    
+
     # Make sure the file has an extension
     if '.' not in filename:
         content_type = resp.headers.get('content-type', '')
@@ -77,11 +77,11 @@ def process_article(url, website):
     elif website.name == "东方烟草报":
         # Better title extraction for 东方烟草报: try selectors in priority order
         title_tag = (
-            soup.find("h1", id="title") or # specifically an h1 with id="title"
-            soup.find("h1") or # primary heading tag
-            soup.find("title") or # the page's <title> tag
-            soup.find("div", class_="title") or # some pages use div.title
-            soup.find("h2") # fallback heading tag
+            soup.find("h1", id="title") or       # specifically an h1 with id="title"
+            soup.find("h1") or                   # primary heading tag
+            soup.find("title") or                # the page's <title> tag
+            soup.find("div", class_="title") or  # some pages use div.title
+            soup.find("h2")                      # fallback heading tag
         )
         content_tag = soup.find("div", class_="content")  # 东方烟草报 content usually lives in div.content
         # Also support an alternative content structure
@@ -96,7 +96,7 @@ def process_article(url, website):
         content_tag = soup.find("div", class_="content") or soup.find("div", id="content")
 
     title = title_tag.get_text(strip=True) if title_tag else "无标题"
-    
+
     # Extra cleanup: strip any stray whitespace from the title
     title = title.strip() if title else "无标题"
@@ -184,16 +184,16 @@ def full_site_crawler(start_url, website, max_pages=1000):
             parsed_url = urlparse(url)
             path = parsed_url.path
             is_article_page = (
-            soup.find("div", class_="content") is not None or
-            soup.find("div", id="gallery") is not None or
-            soup.find("div", id="ContentText") is not None or
-            ("/content/" in path and len(path) > 20)
+                soup.find("div", class_="content") is not None or
+                soup.find("div", id="gallery") is not None or
+                soup.find("div", id="ContentText") is not None or
+                ("/content/" in path and len(path) > 20)
             )
         else:
             # Default article-page detection
             is_article_page = (
-            soup.find("div", class_="content") is not None or
-            soup.find("div", id="content") is not None
+                soup.find("div", class_="content") is not None or
+                soup.find("div", id="content") is not None
             )
 
         # If this is an article page, hand it to the article processor
@@ -205,4 +205,4 @@ def full_site_crawler(start_url, website, max_pages=1000):
     for link in soup.find_all("a", href=True):
         href = urljoin(url, link["href"])
         if href not in visited and is_valid_url(href, base_netloc):
-            queue.append(href)
\ No newline at end of file
+            queue.append(href)
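The whitespace-only edits in `download_media` leave its sanitization logic unchanged; for reference, a standalone sketch of what that `re.sub` pattern does (the sample filename is invented):

```python
# Demo of the filename sanitization used in download_media: every character
# that is not a word character, hyphen, underscore, or dot becomes '_'.
import re

filename = "report (final).pdf"              # hypothetical input
safe = re.sub(r'[^\w\-_\.]', '_', filename)
print(safe)                                  # -> report__final_.pdf
```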
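The realigned selector chain in `process_article` falls through from the most specific selector to the most generic. A self-contained illustration of the same pattern, assuming `beautifulsoup4` is installed (the sample HTML is made up):

```python
# Standalone demo of the title-extraction fallback chain.
from bs4 import BeautifulSoup

html = ('<html><head><title>Page title</title></head>'
        '<body><h1 id="title">Article headline</h1></body></html>')
soup = BeautifulSoup(html, "html.parser")

title_tag = (
    soup.find("h1", id="title")          # most specific: h1 with id="title"
    or soup.find("h1")                   # any primary heading
    or soup.find("title")                # the page's <title>
    or soup.find("div", class_="title")  # some pages use div.title
    or soup.find("h2")                   # last-resort heading
)
title = title_tag.get_text(strip=True) if title_tag else "无标题"
print(title)  # -> Article headline
```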