diff --git a/core/management/commands/crawl_all_media.py b/core/management/commands/crawl_all_media.py
index 064cba2..3c37ae7 100644
--- a/core/management/commands/crawl_all_media.py
+++ b/core/management/commands/crawl_all_media.py
@@ -8,17 +8,17 @@ class Command(BaseCommand):
 
     def add_arguments(self, parser):
         parser.add_argument('--media', type=str, help='指定要爬取的媒体,用逗号分隔')
-        parser.add_argument('--platform', type=str, default='all', 
-                          help='指定平台类型: all(全部), web(网站), mobile(移动端)')
+        parser.add_argument('--platform', type=str, default='all',
+                            help='指定平台类型: all(全部), web(网站), mobile(移动端)')
 
     def handle(self, *args, **options):
         media_list = options['media']
         platform = options['platform']
-        
+
         # 所有中央主流媒体配置
         all_media = {
             'rmrb': 'crawl_rmrb',
-            'xinhua': 'crawl_xinhua', 
+            'xinhua': 'crawl_xinhua',
             'cctv': 'crawl_cctv',
             'qiushi': 'crawl_qiushi',
             'pla': 'crawl_pla',
@@ -39,15 +39,15 @@ class Command(BaseCommand):
             'qizhi': 'crawl_qizhi',
             'china': 'crawl_china'
         }
-        
+
         # 如果指定了特定媒体,则只爬取指定的媒体
         if media_list:
             target_media = [media.strip() for media in media_list.split(',')]
         else:
             target_media = list(all_media.keys())
-        
+
         self.stdout.write(f"开始批量爬取 {len(target_media)} 家中央主流媒体...")
-        
+
         for media in target_media:
             if media in all_media:
                 command_name = all_media[media]
@@ -59,17 +59,17 @@ class Command(BaseCommand):
                     self.stdout.write(self.style.ERROR(f"爬取 {media} 失败: {e}"))
             else:
                 self.stdout.write(self.style.WARNING(f"未知媒体: {media}"))
-        
+
         self.stdout.write(self.style.SUCCESS("所有中央主流媒体爬取完成"))
-        
+
         # 显示统计信息
         total_websites = Website.objects.count()
         total_articles = sum([website.article_set.count() for website in Website.objects.all()])
-        
+
         self.stdout.write(f"统计信息:")
         self.stdout.write(f"- 总网站数: {total_websites}")
         self.stdout.write(f"- 总文章数: {total_articles}")
-        
+
         # 显示各媒体文章数量
         self.stdout.write(f"各媒体文章数量:")
         for website in Website.objects.all():
diff --git a/core/management/commands/crawl_cctv.py b/core/management/commands/crawl_cctv.py
index 2267a7e..cf24b9f 100644
--- a/core/management/commands/crawl_cctv.py
+++ b/core/management/commands/crawl_cctv.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 中央广播电视总台及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['cctv', 'cctvnews', 'mobile', 'all'],
-                          help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['cctv', 'cctvnews', 'mobile', 'all'],
+                            help='选择爬取平台: cctv(央视网), cctvnews(央视新闻), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 中央广播电视总台各平台配置
         platforms = {
             'cctv': {
@@ -35,12 +35,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -50,16 +50,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("中央广播电视总台所有平台爬取完成"))
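Reviewer note: the statistics block in crawl_all_media issues one COUNT query per `Website` via `article_set.count()`. A single aggregate returns the same numbers without the N+1 pattern; a minimal sketch, assuming `core.models` defines an `Article` model with a ForeignKey to `Website` (implied by `article_set` above, but not shown in this diff):

```python
# Sketch only: same totals as the loop in crawl_all_media, without N+1 queries.
from django.db.models import Count
from core.models import Article, Website

total_articles = Article.objects.count()  # one COUNT(*) instead of one per site

# Per-media counts in a single annotated query; "article" is the default
# reverse accessor name behind article_set.
for website in Website.objects.annotate(num_articles=Count("article")):
    print(f"- {website.name}: {website.num_articles}")
```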
diff --git a/core/management/commands/crawl_china.py b/core/management/commands/crawl_china.py
index 5114a3e..2cd5eaf 100644
--- a/core/management/commands/crawl_china.py
+++ b/core/management/commands/crawl_china.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 中国网主网及中国网一省份,不转发二级子网站"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['china', 'province', 'all'],
-                          help='选择爬取平台: china(中国网主网), province(中国网一省份), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['china', 'province', 'all'],
+                            help='选择爬取平台: china(中国网主网), province(中国网一省份), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 中国网各平台配置
         platforms = {
             'china': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("中国网所有平台爬取完成"))
diff --git a/core/management/commands/crawl_chinadaily.py b/core/management/commands/crawl_chinadaily.py
index fe6d426..69b5e7f 100644
--- a/core/management/commands/crawl_chinadaily.py
+++ b/core/management/commands/crawl_chinadaily.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 中国日报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['chinadaily', 'mobile', 'all'],
-                          help='选择爬取平台: chinadaily(中国日报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['chinadaily', 'mobile', 'all'],
+                            help='选择爬取平台: chinadaily(中国日报), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 中国日报各平台配置
         platforms = {
             'chinadaily': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("中国日报所有平台爬取完成"))
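Every command in this patch repeats the same get_or_create-then-save dance to refresh an existing `Website` row. Django's `update_or_create` expresses that in one call; a sketch of the equivalent, using the same fields as the loops above (drop-in for the body of the `for platform_config` loop):

```python
# Sketch: replacement for the get_or_create + "if not created" block.
# update_or_create always applies `defaults`, so existing rows are refreshed too.
from core.models import Website  # assumed location of the Website model

website, created = Website.objects.update_or_create(
    name=platform_config['name'],
    defaults={
        'base_url': platform_config['base_url'],
        'article_list_url': platform_config['start_url'],
        'article_selector': platform_config['article_selector'],
    },
)
```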
diff --git a/core/management/commands/crawl_chinanews.py b/core/management/commands/crawl_chinanews.py
index 00f29c1..4583c58 100644
--- a/core/management/commands/crawl_chinanews.py
+++ b/core/management/commands/crawl_chinanews.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 中国新闻社及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['chinanews', 'mobile', 'all'],
-                          help='选择爬取平台: chinanews(中国新闻社), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['chinanews', 'mobile', 'all'],
+                            help='选择爬取平台: chinanews(中国新闻社), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 中国新闻社各平台配置
         platforms = {
             'chinanews': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("中国新闻社所有平台爬取完成"))
diff --git a/core/management/commands/crawl_fzrb.py b/core/management/commands/crawl_fzrb.py
index a19e2b2..fbc0251 100644
--- a/core/management/commands/crawl_fzrb.py
+++ b/core/management/commands/crawl_fzrb.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 法治日报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['fzrb', 'mobile', 'all'],
-                          help='选择爬取平台: fzrb(法治日报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['fzrb', 'mobile', 'all'],
+                            help='选择爬取平台: fzrb(法治日报), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 法治日报各平台配置
         platforms = {
             'fzrb': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("法治日报所有平台爬取完成"))
diff --git a/core/management/commands/crawl_gmrb.py b/core/management/commands/crawl_gmrb.py
index 5a4f3f2..72cd6ca 100644
--- a/core/management/commands/crawl_gmrb.py
+++ b/core/management/commands/crawl_gmrb.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 光明日报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['gmrb', 'mobile', 'all'],
-                          help='选择爬取平台: gmrb(光明日报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['gmrb', 'mobile', 'all'],
+                            help='选择爬取平台: gmrb(光明日报), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 光明日报各平台配置
         platforms = {
             'gmrb': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("光明日报所有平台爬取完成"))
diff --git a/core/management/commands/crawl_grrb.py b/core/management/commands/crawl_grrb.py
index 689eb25..a688c4f 100644
--- a/core/management/commands/crawl_grrb.py
+++ b/core/management/commands/crawl_grrb.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 工人日报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['grrb', 'mobile', 'all'],
-                          help='选择爬取平台: grrb(工人日报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['grrb', 'mobile', 'all'],
+                            help='选择爬取平台: grrb(工人日报), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 工人日报各平台配置
         platforms = {
             'grrb': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
-        self.stdout.write(self.style.SUCCESS("工人日报所有平台爬取完成"))
\ No newline at end of file
+
+        self.stdout.write(self.style.SUCCESS("工人日报所有平台爬取完成"))
diff --git a/core/management/commands/crawl_jjrb.py b/core/management/commands/crawl_jjrb.py
index 2a2e14f..c4ce837 100644
--- a/core/management/commands/crawl_jjrb.py
+++ b/core/management/commands/crawl_jjrb.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 经济日报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['jjrb', 'mobile', 'all'],
-                          help='选择爬取平台: jjrb(经济日报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['jjrb', 'mobile', 'all'],
+                            help='选择爬取平台: jjrb(经济日报), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 经济日报各平台配置
         platforms = {
             'jjrb': {
@@ -23,12 +23,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             },
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -38,16 +38,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("经济日报所有平台爬取完成"))
diff --git a/core/management/commands/crawl_nmrb.py b/core/management/commands/crawl_nmrb.py
index e93f8dd..ee71913 100644
--- a/core/management/commands/crawl_nmrb.py
+++ b/core/management/commands/crawl_nmrb.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 农民日报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['nmrb', 'mobile', 'all'],
-                          help='选择爬取平台: nmrb(农民日报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['nmrb', 'mobile', 'all'],
+                            help='选择爬取平台: nmrb(农民日报), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 农民日报各平台配置
         platforms = {
             'nmrb': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("农民日报所有平台爬取完成"))
diff --git a/core/management/commands/crawl_pla.py b/core/management/commands/crawl_pla.py
index 1243ec9..8041cdc 100644
--- a/core/management/commands/crawl_pla.py
+++ b/core/management/commands/crawl_pla.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 解放军报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['pla', 'mobile', 'all'],
-                          help='选择爬取平台: pla(解放军报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['pla', 'mobile', 'all'],
+                            help='选择爬取平台: pla(解放军报), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 解放军报各平台配置
         platforms = {
             'pla': {
@@ -23,12 +23,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             },
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -38,16 +38,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("解放军报所有平台爬取完成"))
diff --git a/core/management/commands/crawl_qiushi.py b/core/management/commands/crawl_qiushi.py
index f5eab0b..3107f64 100644
--- a/core/management/commands/crawl_qiushi.py
+++ b/core/management/commands/crawl_qiushi.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 求是杂志及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['qiushi', 'mobile', 'all'],
-                          help='选择爬取平台: qiushi(求是网), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['qiushi', 'mobile', 'all'],
+                            help='选择爬取平台: qiushi(求是网), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 求是杂志各平台配置
         platforms = {
             'qiushi': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("求是杂志所有平台爬取完成"))
diff --git a/core/management/commands/crawl_qizhi.py b/core/management/commands/crawl_qizhi.py
index 19008fb..d6a36ba 100644
--- a/core/management/commands/crawl_qizhi.py
+++ b/core/management/commands/crawl_qizhi.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 旗帜网及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['qizhi', 'mobile', 'all'],
-                          help='选择爬取平台: qizhi(旗帜网), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['qizhi', 'mobile', 'all'],
+                            help='选择爬取平台: qizhi(旗帜网), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 旗帜网各平台配置
         platforms = {
             'qizhi': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a[href^="/"]'  # 修改选择器以更好地匹配文章链接
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
        for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
-        self.stdout.write(self.style.SUCCESS("旗帜网所有平台爬取完成"))
\ No newline at end of file
+
+        self.stdout.write(self.style.SUCCESS("旗帜网所有平台爬取完成"))
diff --git a/core/management/commands/crawl_rmrb.py b/core/management/commands/crawl_rmrb.py
index 160a6de..27169b1 100644
--- a/core/management/commands/crawl_rmrb.py
+++ b/core/management/commands/crawl_rmrb.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 人民日报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['peopleapp', 'people', 'paper', 'all'],
-                          help='选择爬取平台: peopleapp(客户端), people(人民网), paper(报纸), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['peopleapp', 'people', 'paper', 'all'],
+                            help='选择爬取平台: peopleapp(客户端), people(人民网), paper(报纸), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 人民日报各平台配置
         platforms = {
             'peopleapp': {
@@ -35,12 +35,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -50,16 +50,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
-        self.stdout.write(self.style.SUCCESS("人民日报所有平台爬取完成"))
\ No newline at end of file
+
+        self.stdout.write(self.style.SUCCESS("人民日报所有平台爬取完成"))
diff --git a/core/management/commands/crawl_rmzxb.py b/core/management/commands/crawl_rmzxb.py
index 1ff9fc9..9f49fdc 100644
--- a/core/management/commands/crawl_rmzxb.py
+++ b/core/management/commands/crawl_rmzxb.py
@@ -50,4 +50,4 @@ class Command(BaseCommand):
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
 
-        self.stdout.write(self.style.SUCCESS("人民政协网所有平台爬取完成"))
\ No newline at end of file
+        self.stdout.write(self.style.SUCCESS("人民政协网所有平台爬取完成"))
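For reference, each crawler can also be driven from Python rather than the shell, which is presumably how crawl_all_media dispatches the command names in its all_media map; a usage sketch:

```python
# Usage sketch: run individual crawlers programmatically via call_command.
from django.core.management import call_command

call_command('crawl_rmrb', platform='people')  # one platform only
call_command('crawl_xinhua')                   # --platform defaults to 'all'
```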
diff --git a/core/management/commands/crawl_xinhua.py b/core/management/commands/crawl_xinhua.py
index b2b4393..729ade0 100644
--- a/core/management/commands/crawl_xinhua.py
+++ b/core/management/commands/crawl_xinhua.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 新华社及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['news', 'xinhuanet', 'mobile', 'all'],
-                          help='选择爬取平台: news(新华网), xinhuanet(新华网主站), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['news', 'xinhuanet', 'mobile', 'all'],
+                            help='选择爬取平台: news(新华网), xinhuanet(新华网主站), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 新华社各平台配置
         platforms = {
             'news': {
@@ -35,12 +35,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -50,16 +50,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("新华社所有平台爬取完成"))
diff --git a/core/management/commands/crawl_xuexi.py b/core/management/commands/crawl_xuexi.py
index b4ba45e..5486913 100644
--- a/core/management/commands/crawl_xuexi.py
+++ b/core/management/commands/crawl_xuexi.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 学习强国中央媒体学习号及省级以上学习平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['xuexi', 'central', 'provincial', 'all'],
-                          help='选择爬取平台: xuexi(学习强国主站), central(中央媒体), provincial(省级平台), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['xuexi', 'central', 'provincial', 'all'],
+                            help='选择爬取平台: xuexi(学习强国主站), central(中央媒体), provincial(省级平台), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 学习强国各平台配置
         platforms = {
             'xuexi': {
@@ -35,12 +35,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -50,16 +50,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("学习强国所有平台爬取完成"))
diff --git a/core/management/commands/crawl_xxsb.py b/core/management/commands/crawl_xxsb.py
index a43a141..c73ca4e 100644
--- a/core/management/commands/crawl_xxsb.py
+++ b/core/management/commands/crawl_xxsb.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 学习时报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['xxsb', 'mobile', 'all'],
-                          help='选择爬取平台: xxsb(学习时报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['xxsb', 'mobile', 'all'],
+                            help='选择爬取平台: xxsb(学习时报), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 学习时报各平台配置
         platforms = {
             'xxsb': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("学习时报所有平台爬取完成"))
diff --git a/core/management/commands/crawl_zgfnb.py b/core/management/commands/crawl_zgfnb.py
index 3e5302e..f33bb9f 100644
--- a/core/management/commands/crawl_zgfnb.py
+++ b/core/management/commands/crawl_zgfnb.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 中国妇女报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['zgfnb', 'mobile', 'all'],
-                          help='选择爬取平台: zgfnb(中国妇女报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['zgfnb', 'mobile', 'all'],
+                            help='选择爬取平台: zgfnb(中国妇女报), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 中国妇女报各平台配置
         platforms = {
             'zgfnb': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("中国妇女报所有平台爬取完成"))
diff --git a/core/management/commands/crawl_zgjwjc.py b/core/management/commands/crawl_zgjwjc.py
index d123859..7c509b2 100644
--- a/core/management/commands/crawl_zgjwjc.py
+++ b/core/management/commands/crawl_zgjwjc.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 中国纪检监察报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['zgjwjc', 'mobile', 'all'],
-                          help='选择爬取平台: zgjwjc(中国纪检监察报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['zgjwjc', 'mobile', 'all'],
+                            help='选择爬取平台: zgjwjc(中国纪检监察报), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 中国纪检监察报各平台配置
         platforms = {
             'zgjwjc': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("中国纪检监察报所有平台爬取完成"))
diff --git a/core/management/commands/crawl_zgqnb.py b/core/management/commands/crawl_zgqnb.py
index 9127b93..fcc2081 100644
--- a/core/management/commands/crawl_zgqnb.py
+++ b/core/management/commands/crawl_zgqnb.py
@@ -7,13 +7,13 @@ class Command(BaseCommand):
     help = "全站递归爬取 中国青年报及其子网站、客户端、新媒体平台"
 
     def add_arguments(self, parser):
-        parser.add_argument('--platform', type=str, default='all', 
-                          choices=['zgqnb', 'mobile', 'all'],
-                          help='选择爬取平台: zgqnb(中国青年报), mobile(移动端), all(全部)')
+        parser.add_argument('--platform', type=str, default='all',
+                            choices=['zgqnb', 'mobile', 'all'],
+                            help='选择爬取平台: zgqnb(中国青年报), mobile(移动端), all(全部)')
 
     def handle(self, *args, **options):
         platform = options['platform']
-        
+
         # 中国青年报各平台配置
         platforms = {
             'zgqnb': {
@@ -29,12 +29,12 @@ class Command(BaseCommand):
                 'article_selector': 'a'
             }
         }
-        
+
         if platform == 'all':
             target_platforms = platforms.values()
         else:
             target_platforms = [platforms[platform]]
-        
+
         for platform_config in target_platforms:
             website, created = Website.objects.get_or_create(
                 name=platform_config['name'],
@@ -44,16 +44,16 @@ class Command(BaseCommand):
                     'article_selector': platform_config['article_selector']
                 }
             )
-            
+
             # 确保更新已存在的网站对象的配置
             if not created:
                 website.base_url = platform_config['base_url']
                 website.article_list_url = platform_config['start_url']
                 website.article_selector = platform_config['article_selector']
                 website.save()
-            
+
             self.stdout.write(f"开始爬取: {platform_config['name']} - {platform_config['start_url']}")
             full_site_crawler(platform_config['start_url'], website, max_pages=500)
             self.stdout.write(f"完成爬取: {platform_config['name']}")
-        
+
         self.stdout.write(self.style.SUCCESS("中国青年报所有平台爬取完成"))
diff --git a/core/templates/core/article_detail.html b/core/templates/core/article_detail.html
index 5fe39af..e101736 100644
--- a/core/templates/core/article_detail.html
+++ b/core/templates/core/article_detail.html
@@ -77,7 +77,8 @@
                 网站: {{ article.website.name }} |
                 发布时间: {{ article.pub_date|date:"Y-m-d H:i" }} |
-                创建时间: {{ article.created_at|date:"Y-m-d H:i" }}
+                创建时间: {{ article.created_at|date:"Y-m-d H:i" }} |
+                源网址: {{ article.url }}
diff --git a/core/utils.py b/core/utils.py
index 6bec751..634fbfd 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -126,7 +126,8 @@ def process_article(url, website):
             soup.find("div", class_="content") or
             soup.find("div", id="content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="main-content")
+            soup.find("div", class_="main-content") or
+            soup.find("span", id="detailContent")  # 添加新华网特有的内容容器
         )
     elif website.name == "东方烟草报":
         # 优化东方烟草报的标题提取逻辑,按优先级尝试多种选择器
@@ -177,45 +178,45 @@ def process_article(url, website):
             soup.find("div", class_="rm_txt_con") or  # 添加人民网特有的内容容器
             soup.find("div", class_="text_c")  # 添加新的内容容器
         )
-        
+
         # 针对人民网的特殊处理,清理内容中的无关元素
         if content_tag:
             # 移除编辑信息
             for editor_element in content_tag.find_all("div", class_="edit"):
                 editor_element.decompose()
-            
+
             # 移除分享相关元素
             for share_element in content_tag.find_all("p", class_="paper_num"):
                 share_element.decompose()
-            
+
             # 移除无关的box_pic元素
             for pic_element in content_tag.find_all("div", class_="box_pic"):
                 pic_element.decompose()
-            
+
             # 移除无关的zdfy元素
             for zdfy_element in content_tag.find_all("div", class_="zdfy"):
                 zdfy_element.decompose()
-            
+
             # 移除无关的center元素
             for center_element in content_tag.find_all("center"):
                 center_element.decompose()
-            
+
             # 移除无关的bza元素
             for bza_element in content_tag.find_all("div", class_="bza"):
                 bza_element.decompose()
-            
+
             # 移除隐藏的无关元素
             for hidden_element in content_tag.find_all(attrs={"style": "display: none;"}):
                 hidden_element.decompose()
-            
+
             # 移除相关专题
             for related_element in content_tag.find_all("div", id="rwb_tjyd"):
                 related_element.decompose()
-            
+
             # 移除推荐阅读
             for recommend_element in content_tag.find_all("div", class_="clearfix box_cai"):
                 recommend_element.decompose()
-            
+
             # 移除相关专题列表
             for topic_element in content_tag.find_all("div", class_="clearfix text_like"):
                 topic_element.decompose()
@@ -414,7 +415,7 @@ def process_article(url, website):
                 title_text = first_p.find("strong").get_text().strip()
                 # 创建一个虚拟的title_tag对象
                 title_tag = first_p.find("strong")
-        
+
         content_tag = (
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
@@ -425,27 +426,28 @@ def process_article(url, website):
             soup.find("div", class_="article-body") or
             soup.find("div", class_="text_box")  # 添加人民政协网特有内容容器
         )
-        
+
         # 针对人民政协网的特殊处理,清理内容中的无关元素
         if content_tag:
             # 移除编辑信息
             for editor_element in content_tag.find_all("p", class_="Editor"):
                 editor_element.decompose()
-            
+
             # 移除分享相关元素
             for share_element in content_tag.find_all("div", class_="share"):
                 share_element.decompose()
-            
+
             # 移除Remark元素
             for remark_element in content_tag.find_all("div", class_="Remark"):
                 remark_element.decompose()
-            
+
             # 移除Paging元素
             for paging_element in content_tag.find_all("div", class_="Paging"):
                 paging_element.decompose()
-            
+
             # 移除政协号客户端下载提示
-            for zxh_element in content_tag.find_all("div", style=lambda x: x and "background:#F9F9F9;padding:50px" in x):
+            for zxh_element in content_tag.find_all("div",
+                                                    style=lambda x: x and "background:#F9F9F9;padding:50px" in x):
                 zxh_element.decompose()
 
             # 移除版权信息
@@ -503,6 +505,7 @@ def process_article(url, website):
             soup.find("title")
         )
         content_tag = (
+            soup.find("div", id="detail") or  # 添加学习时报特有内容容器
             soup.find("div", class_="content") or
             soup.find("div", class_="article-content") or
             soup.find("div", id="content") or
@@ -511,6 +514,24 @@ def process_article(url, website):
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
+
+        # 针对学习时报的特殊处理,清理内容中的无关元素
+        if content_tag:
+            # 移除编辑信息
+            for editor_element in content_tag.find_all("div", class_="editor"):
+                editor_element.decompose()
+
+            # 移除分享相关元素
+            for share_element in content_tag.find_all("div", class_="share"):
+                share_element.decompose()
+
+            # 移除无关的TRS_Editor包装层
+            for trs_editor in content_tag.find_all("div", class_="TRS_Editor"):
+                trs_editor.unwrap()  # unwrap只移除标签,保留内容
+
+            # 移除Custom_UnionStyle包装层
+            for custom_style in content_tag.find_all("div", class_="Custom_UnionStyle"):
+                custom_style.unwrap()  # unwrap只移除标签,保留内容
     elif "中国青年报" in website.name or "cyol" in website.name:
         # 中国青年报的文章结构处理 - 修复无法爬取问题
         title_tag = (
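The new 学习时报 cleanup mixes two BeautifulSoup operations that are easy to conflate: `decompose()` deletes a tag together with its contents, while `unwrap()` removes only the wrapper tag and splices its children back in (which is why it is used for the TRS_Editor and Custom_UnionStyle layers). A small self-contained illustration, with made-up sample markup:

```python
from bs4 import BeautifulSoup

html = ('<div id="detail">'
        '<div class="TRS_Editor"><p>正文</p></div>'
        '<div class="editor">责任编辑:xxx</div>'
        '</div>')
soup = BeautifulSoup(html, "html.parser")

soup.find("div", class_="editor").decompose()   # tag AND its text are gone
soup.find("div", class_="TRS_Editor").unwrap()  # wrapper gone, <p>正文</p> kept

print(soup)  # -> <div id="detail"><p>正文</p></div>
```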
@@ -532,6 +553,7 @@ def process_article(url, website):
             soup.find("h1", class_="title") or
             soup.find("h1") or
+            soup.find("p", class_="f_container_title") or  # 添加中国妇女报特有标题容器
             soup.find("title")
         )
         content_tag = (
@@ -541,7 +563,9 @@ def process_article(url, website):
             soup.find("div", class_="text") or
             soup.find("div", class_="main-content") or
             soup.find("div", class_="article") or
-            soup.find("div", class_="article-body")
+            soup.find("div", class_="article-body") or
+            soup.find("div", class_="f_container_left") or  # 添加中国妇女报特有内容容器
+            soup.find("div", class_="f_container")  # 添加另一种可能的内容容器
         )
     elif "法治日报" in website.name or "legaldaily" in website.name:
         # 法治日报的文章结构处理 - 修复无法爬取问题
@@ -604,7 +628,7 @@ def process_article(url, website):
     elif "旗帜网" in website.name or "qizhiwang" in website.name:
         # 旗帜网的文章结构处理 - 修复不保存文章内容问题
         title_tag = (
-            soup.find("div", class_="w1200 flag-text-tit clearfix") and 
+            soup.find("div", class_="w1200 flag-text-tit clearfix") and
             soup.find("div", class_="w1200 flag-text-tit clearfix").find("h1") or
             soup.find("h1", class_="title") or
             soup.find("h1") or
@@ -620,29 +644,29 @@ def process_article(url, website):
             soup.find("div", class_="article") or
             soup.find("div", class_="article-body")
         )
-        
+
         # 针对旗帜网的特殊处理,清理内容中的无关元素
         if content_tag:
             # 移除编辑信息
             for editor_element in content_tag.find_all("p", class_="editor"):
                 editor_element.decompose()
-            
+
             # 移除分享相关元素
             for share_element in content_tag.find_all("div", class_="share-demo"):
                 share_element.decompose()
-            
+
             # 移除文字缩放相关元素
             for scale_element in content_tag.find_all("div", class_="scale-main"):
                 scale_element.decompose()
-            
+
             # 移除无关的div.pic元素
             for pic_element in content_tag.find_all("div", class_="pic"):
                 pic_element.decompose()
-            
+
             # 移除无关的zdfy元素
             for zdfy_element in content_tag.find_all("div", class_="zdfy"):
                 zdfy_element.decompose()
-            
+
             # 移除无关的center元素
             for center_element in content_tag.find_all("center"):
                 center_element.decompose()
@@ -714,6 +738,28 @@ def process_article(url, website):
             if not src:
                 src = video.get("data-url") or video.get("data-video")
 
+            # 新增:检查新华网特有的视频源属性
+            if not src:
+                src = video.get("data-video-src")
+
+            # 新增:针对新华网的特殊处理,从复杂播放器结构中提取视频源
+            if not src and "新华网" in website.name:
+                # 尝试从video标签的属性中直接获取src
+                for attr in video.attrs:
+                    if 'src' in attr.lower():
+                        src = video.attrs.get(attr)
+                        break
+
+                # 如果还是没有找到,尝试查找父容器中的视频源信息
+                if not src:
+                    parent = video.parent
+                    if parent and parent.name == 'div' and 'player-container' in parent.get('class', []):
+                        # 检查是否有data-*属性包含视频信息
+                        for attr, value in parent.attrs.items():
+                            if 'data' in attr and isinstance(value, str) and ('.mp4' in value or 'video' in value):
+                                src = value
+                                break
+
             if not src:
                 continue
@@ -726,6 +772,10 @@ def process_article(url, website):
             if "cctv.com" in src or "cntv.cn" in src:
                 print(f"发现央视视频: {src}")
 
+            # 针对新华网的特殊处理
+            elif "新华网" in website.name:
+                print(f"发现新华网视频: {src}")
+
             local_path = download_media(src, save_dir)
             if local_path:
                 rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
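The 新华网 fallback above scans every attribute on the `<video>` tag for a name containing "src", then falls back to data-* attributes on the wrapping player container. A condensed, self-contained version of that scan (the URL and markup are made-up test values):

```python
from bs4 import BeautifulSoup

html = ('<div class="player-container" data-video-url="https://example.com/clip.mp4">'
        '<video data-video-src=""></video></div>')
soup = BeautifulSoup(html, "html.parser")
video = soup.find("video")

# Step 1: any attribute on <video> whose name mentions "src" and is non-empty.
src = next((v for k, v in video.attrs.items() if 'src' in k.lower() and v), None)

# Step 2: otherwise look at data-* attributes on the parent player container.
if not src:
    parent = video.parent
    if parent and 'player-container' in parent.get('class', []):
        for attr, value in parent.attrs.items():
            if 'data' in attr and isinstance(value, str) and ('.mp4' in value or 'video' in value):
                src = value
                break

print(src)  # -> https://example.com/clip.mp4
```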
id="content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="main-content") is not None or + soup.find("span", id="detailContent") is not None or # 添加新华网特有内容容器判断 ("/news/" in path) or ("/article/" in path) or (path.startswith("/detail/") and len(path) > 10) @@ -1064,6 +1115,7 @@ def full_site_crawler(start_url, website, max_pages=1000): soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or + soup.find("div", id="detail") is not None or # 添加学习时报特有内容容器判断 ("/article/" in path) or ("/content/" in path) or (path.startswith("/detail/") and len(path) > 10) @@ -1168,7 +1220,6 @@ def full_site_crawler(start_url, website, max_pages=1000): soup.find("div", class_="article-content") is not None or (soup.find("div", id="content") is not None and soup.find("h1") is not None) or - soup.find("div", class_="text") is not None or soup.find("div", class_="main-content") is not None or soup.find("div", class_="article") is not None or soup.find("div", class_="article-body") is not None or @@ -1220,7 +1271,7 @@ def full_site_crawler(start_url, website, max_pages=1000): if ("/article/" in href_path or href_path.startswith("/detail/") or ("/dynamic/" in href_path and "article" in href_path) or - href_path.count("/") > 2): # 更深层的页面可能是文章页 + href_path.count("/") > 2): # 更深层 queue.append(href) elif href not in visited and is_valid_url(href, base_netloc): queue.append(href)