Add packages
This commit is contained in:
@@ -8,7 +8,7 @@
|
|||||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
||||||
line-height: 1.6;
|
line-height: 1.6;
|
||||||
color: #333;
|
color: #333;
|
||||||
max-width: 800px;
|
max-width: 1200px; /* 修改:同步调整页面最大宽度与列表页一致 */
|
||||||
margin: 0 auto;
|
margin: 0 auto;
|
||||||
padding: 20px;
|
padding: 20px;
|
||||||
background-color: #f8f9fa;
|
background-color: #f8f9fa;
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
|
||||||
line-height: 1.6;
|
line-height: 1.6;
|
||||||
color: #333;
|
color: #333;
|
||||||
max-width: 800px;
|
max-width: 1200px; /* 修改:增加页面最大宽度 */
|
||||||
margin: 0 auto;
|
margin: 0 auto;
|
||||||
padding: 20px;
|
padding: 20px;
|
||||||
background-color: #f8f9fa;
|
background-color: #f8f9fa;
|
||||||
@@ -80,7 +80,7 @@
|
|||||||
color: white;
|
color: white;
|
||||||
text-decoration: none;
|
text-decoration: none;
|
||||||
border-radius: 4px;
|
border-radius: 4px;
|
||||||
margin: 0 5px;
|
margin: 0 2px; /* 修改:调整页码间距 */
|
||||||
}
|
}
|
||||||
.pagination a:hover {
|
.pagination a:hover {
|
||||||
background-color: #2980b9;
|
background-color: #2980b9;
|
||||||
@@ -89,6 +89,17 @@
|
|||||||
margin: 0 10px;
|
margin: 0 10px;
|
||||||
color: #7f8c8d;
|
color: #7f8c8d;
|
||||||
}
|
}
|
||||||
|
/* 新增:当前页码样式 */
|
||||||
|
.pagination .current {
|
||||||
|
background-color: #2980b9;
|
||||||
|
cursor: default;
|
||||||
|
}
|
||||||
|
/* 新增:省略号样式 */
|
||||||
|
.pagination .ellipsis {
|
||||||
|
display: inline-block;
|
||||||
|
padding: 8px 4px;
|
||||||
|
color: #7f8c8d;
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
@@ -117,19 +128,46 @@
|
|||||||
<div class="pagination">
|
<div class="pagination">
|
||||||
{% if page_obj.has_previous %}
|
{% if page_obj.has_previous %}
|
||||||
{% if selected_website %}
|
{% if selected_website %}
|
||||||
|
<a href="?website={{ selected_website.id }}&page=1">« 首页</a>
|
||||||
<a href="?website={{ selected_website.id }}&page={{ page_obj.previous_page_number }}">上一页</a>
|
<a href="?website={{ selected_website.id }}&page={{ page_obj.previous_page_number }}">上一页</a>
|
||||||
{% else %}
|
{% else %}
|
||||||
|
<a href="?page=1">« 首页</a>
|
||||||
<a href="?page={{ page_obj.previous_page_number }}">上一页</a>
|
<a href="?page={{ page_obj.previous_page_number }}">上一页</a>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
|
<span>第 {{ page_obj.number }} 页,共 {{ page_obj.paginator.num_pages }} 页</span>
|
||||||
|
|
||||||
|
<!-- 修改:优化页码显示逻辑 -->
|
||||||
|
{% with page_obj.paginator as paginator %}
|
||||||
|
{% for num in paginator.page_range %}
|
||||||
|
{% if page_obj.number == num %}
|
||||||
|
<a href="#" class="current">{{ num }}</a>
|
||||||
|
{% elif num > page_obj.number|add:'-3' and num < page_obj.number|add:'3' %}
|
||||||
|
{% if selected_website %}
|
||||||
|
<a href="?website={{ selected_website.id }}&page={{ num }}">{{ num }}</a>
|
||||||
|
{% else %}
|
||||||
|
<a href="?page={{ num }}">{{ num }}</a>
|
||||||
|
{% endif %}
|
||||||
|
{% elif num == 1 or num == paginator.num_pages %}
|
||||||
|
{% if selected_website %}
|
||||||
|
<a href="?website={{ selected_website.id }}&page={{ num }}">{{ num }}</a>
|
||||||
|
{% else %}
|
||||||
|
<a href="?page={{ num }}">{{ num }}</a>
|
||||||
|
{% endif %}
|
||||||
|
{% elif num == page_obj.number|add:'-3' or num == page_obj.number|add:'3' %}
|
||||||
|
<span class="ellipsis">...</span>
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
{% endwith %}
|
||||||
|
|
||||||
{% if page_obj.has_next %}
|
{% if page_obj.has_next %}
|
||||||
{% if selected_website %}
|
{% if selected_website %}
|
||||||
<a href="?website={{ selected_website.id }}&page={{ page_obj.next_page_number }}">下一页</a>
|
<a href="?website={{ selected_website.id }}&page={{ page_obj.next_page_number }}">下一页</a>
|
||||||
|
<a href="?website={{ selected_website.id }}&page={{ page_obj.paginator.num_pages }}">末页 »</a>
|
||||||
{% else %}
|
{% else %}
|
||||||
<a href="?page={{ page_obj.next_page_number }}">下一页</a>
|
<a href="?page={{ page_obj.next_page_number }}">下一页</a>
|
||||||
|
<a href="?page={{ page_obj.paginator.num_pages }}">末页 »</a>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -77,11 +77,11 @@ def process_article(url, website):
|
|||||||
elif website.name == "东方烟草报":
|
elif website.name == "东方烟草报":
|
||||||
# 优化东方烟草报的标题提取逻辑,按优先级尝试多种选择器
|
# 优化东方烟草报的标题提取逻辑,按优先级尝试多种选择器
|
||||||
title_tag = (
|
title_tag = (
|
||||||
soup.find("h1", id="title") or # 特别针对带id="title"的h1标签
|
soup.find("h1", id="title") or # 特别针对带id="title"的h1标签
|
||||||
soup.find("h1") or # 主要标题标签
|
soup.find("h1") or # 主要标题标签
|
||||||
soup.find("title") or # 页面title标签
|
soup.find("title") or # 页面title标签
|
||||||
soup.find("div", class_="title") or # 某些页面可能使用div.title
|
soup.find("div", class_="title") or # 某些页面可能使用div.title
|
||||||
soup.find("h2") # 备选标题标签
|
soup.find("h2") # 备选标题标签
|
||||||
)
|
)
|
||||||
content_tag = soup.find("div", class_="content") # 东方烟草报的内容通常在div.content中
|
content_tag = soup.find("div", class_="content") # 东方烟草报的内容通常在div.content中
|
||||||
# 增加对另一种内容结构的支持
|
# 增加对另一种内容结构的支持
|
||||||
@@ -184,16 +184,16 @@ def full_site_crawler(start_url, website, max_pages=1000):
|
|||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
path = parsed_url.path
|
path = parsed_url.path
|
||||||
is_article_page = (
|
is_article_page = (
|
||||||
soup.find("div", class_="content") is not None or
|
soup.find("div", class_="content") is not None or
|
||||||
soup.find("div", id="gallery") is not None or
|
soup.find("div", id="gallery") is not None or
|
||||||
soup.find("div", id="ContentText") is not None or
|
soup.find("div", id="ContentText") is not None or
|
||||||
("/content/" in path and len(path) > 20)
|
("/content/" in path and len(path) > 20)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# 默认判断逻辑
|
# 默认判断逻辑
|
||||||
is_article_page = (
|
is_article_page = (
|
||||||
soup.find("div", class_="content") is not None or
|
soup.find("div", class_="content") is not None or
|
||||||
soup.find("div", id="content") is not None
|
soup.find("div", id="content") is not None
|
||||||
)
|
)
|
||||||
|
|
||||||
# 如果是文章页面,则调用文章处理
|
# 如果是文章页面,则调用文章处理
|
||||||
|
|||||||
Reference in New Issue
Block a user