Download by product
import requests
import re
import os
import time
from bs4 import BeautifulSoup
def GetPage(url):
    page = requests.get(url)
    html = page.text
    return html
html_doc = GetPage('https://www.alibabacloud.com/help/zh')
# Build the list of product page links
soup = BeautifulSoup(html_doc, 'html.parser')
index_url = []
baseurl = 'https://www.alibabacloud.com'
for k in soup.find_all(href=re.compile("product")):
    index_url.append(baseurl + k.get('href'))
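The href filter matches every link whose URL contains "product", so index_url may hold the same product page more than once if the help index repeats a link (an assumption I have not verified against the live page). A one-line, order-preserving dedupe would be:

# Drop duplicate product links while keeping their original order
index_url = list(dict.fromkeys(index_url))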
def download_pdf(producturl):
    html_doc = GetPage(producturl)
    soup = BeautifulSoup(html_doc, 'html.parser')

    def get_product_name():
        """Get the product name, e.g. 云服务器_ECS."""
        product_names = soup.find('h3', class_="product-name")
        for name in product_names.strings:
            product_name = name
        return product_name.replace(' ', '')

    def get_pdf_title():
        """Get the PDF title, e.g. 新功能发布记录 (release notes)."""
        product_names = soup.find(class_="download-pdf")
        # print(product_names.parent)
        product_feature = product_names.parent
        pdf_title = product_feature.h3.string
        return pdf_title.replace(' ', '')

    def get_pdfs_file():
        """Download every PDF linked on the product page via wget."""
        pdf_names = soup.find_all(class_="download-pdf")
        for pdfs in pdf_names:
            product_name = get_product_name()
            product_feature = pdfs.parent
            pdf_title = product_feature.h3.string
            pdf_url_tag = product_feature.a
            pdf_filename = product_name + '_' + pdf_title.replace(' ', '') + '.pdf'
            pdf_url_str = str(pdf_url_tag.get('href'))
            if pdf_url_str.startswith('//'):
                # Protocol-relative link: prepend a scheme
                pdf_url_str = 'http:' + pdf_url_str
            if not os.path.exists(product_name):
                os.makedirs(product_name)
            # print(pdf_filename)
            # print(pdf_url_str)
            wget_cmd = 'wget ' + pdf_url_str + ' -O "' + product_name + '/' + pdf_filename + '"'
            print(wget_cmd)
            os.system(wget_cmd)
            time.sleep(1)

    get_pdfs_file()
# Run the crawl over every product page
for products in index_url:
    download_pdf(products)
    time.sleep(1)
# download_pdf('https://www.alibabacloud.com/help/zh/product/147291.htm')
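The script shells out to wget, which keeps the code short but ties it to having wget installed. As a rough sketch of a pure-requests alternative (save_pdf is a hypothetical helper, not part of the original script):

import os
import requests

def save_pdf(pdf_url, product_name, pdf_filename):
    """Fetch one PDF over HTTP and write it into the product's folder."""
    os.makedirs(product_name, exist_ok=True)
    resp = requests.get(pdf_url, timeout=60)
    resp.raise_for_status()
    with open(os.path.join(product_name, pdf_filename), 'wb') as f:
        f.write(resp.content)

Inside get_pdfs_file() this would stand in for the wget_cmd / os.system lines, e.g. save_pdf(pdf_url_str, product_name, pdf_filename).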
There are too many products, so move them into folders by category
import requests
import re
import os
import time
from bs4 import BeautifulSoup
def GetPage(url):
    page = requests.get(url)
    html = page.text
    return html
html_doc = GetPage('https://www.alibabacloud.com/help/zh')
soup = BeautifulSoup(html_doc, 'html.parser')
keys = soup.find_all(class_="masonry-list")
for key in keys:
    category_dl = key.parent
    category = category_dl.dt.h2.string  # product category
    if not os.path.exists(category):
        os.makedirs(category)
    # Move every product folder listed under this category
    for product_code in key.find_all('a'):
        product_text = product_code.string
        product = product_text.replace(' ', '')  # product name
        cmd = 'mv "' + product + '" "' + category + '"'
        os.system(cmd)
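Shelling out to mv ties this step to Unix-like systems. A portable variant, assuming the per-product folders created by the first script sit in the current working directory (a sketch, not the original code), could use shutil instead:

import os
import shutil

def move_into_category(product, category):
    """Move one per-product download folder into its category folder."""
    os.makedirs(category, exist_ok=True)
    if os.path.isdir(product):  # skip products that were never downloaded
        shutil.move(product, os.path.join(category, product))

The isdir check also makes the move idempotent enough to rerun after a partial download pass.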