Alibaba Cloud documentation crawler

Download by product

import requests
import re
import os
import time
from bs4 import BeautifulSoup

def GetPage(url):
    """Fetch a page and return its HTML text."""
    page = requests.get(url, timeout=30)  # timeout so one stalled request cannot hang the crawl
    html = page.text
    return html

html_doc = GetPage('https://www.alibabacloud.com/help/zh')

# Build the list of product page links

soup = BeautifulSoup(html_doc, 'html.parser')
index_url = []
baseurl = 'https://www.alibabacloud.com'

# Every href that contains "product" points at a product documentation page.
for k in soup.find_all(href=re.compile("product")):
    index_url.append(baseurl + k.get('href'))


def download_pdf(product_url):

    html_doc = GetPage(product_url)
    soup = BeautifulSoup(html_doc, 'html.parser')

    def get_product_name():
        """Get the product name, e.g. 云服务器_ECS."""
        product_names = soup.find('h3', class_="product-name")
        for name in product_names.strings:
            product_name = name  # keep the last text node inside the <h3>

        return product_name.replace(' ', '')

    def get_pdf_title():
        """Get the title of one PDF, e.g. 新功能发布记录.
        (Not called below; get_pdfs_file repeats the same lookup per link.)"""
        product_names = soup.find(class_="download-pdf")
        product_feature = product_names.parent
        pdf_title = product_feature.h3.string

        return pdf_title.replace(' ', '')

    def get_pdfs_file():
        """Download every PDF linked from this product page."""
        product_name = get_product_name()  # same for every PDF on the page
        pdf_names = soup.find_all(class_="download-pdf")
        for pdfs in pdf_names:
            product_feature = pdfs.parent
            pdf_title = product_feature.h3.string
            pdf_url_tag = product_feature.a
            pdf_filename = product_name + '_' + pdf_title.replace(' ', '') + '.pdf'
            pdf_url_str = str(pdf_url_tag.get('href'))

            # Protocol-relative links ("//...") need a scheme before wget can fetch them.
            if pdf_url_str.startswith('//'):
                pdf_url_str = 'https:' + pdf_url_str

            os.makedirs(product_name, exist_ok=True)

            # Quote both the URL and the output path so the shell does not
            # split them on spaces or interpret characters such as '&'.
            wget_cmd = 'wget "' + pdf_url_str + '" -O "' + product_name + '/' + pdf_filename + '"'
            print(wget_cmd)
            os.system(wget_cmd)
            time.sleep(1)  # be polite: one request per second

    get_pdfs_file()

# Run the crawl over every product page

for product_url in index_url:
    download_pdf(product_url)
    time.sleep(1)

# Single-product test:
# download_pdf('https://www.alibabacloud.com/help/zh/product/147291.htm')
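
Shelling out to wget ties the script to machines that have wget installed and leaves it exposed to shell-quoting surprises. A minimal portable alternative is to stream the file with requests, which the script already imports; save_pdf below is a hypothetical helper sketched here, not part of the original script:

def save_pdf(pdf_url, out_path):
    """Stream one PDF to disk with requests (portable replacement for wget)."""
    resp = requests.get(pdf_url, stream=True, timeout=30)
    resp.raise_for_status()  # fail loudly on 4xx/5xx instead of saving an error page
    with open(out_path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)

With this helper, the three wget lines inside get_pdfs_file would collapse to save_pdf(pdf_url_str, os.path.join(product_name, pdf_filename)).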

There are too many products, so move the per-product folders into per-category directories

import requests
import os
from bs4 import BeautifulSoup

def GetPage(url):
    """Fetch a page and return its HTML text."""
    page = requests.get(url, timeout=30)
    html = page.text
    return html

html_doc = GetPage('https://www.alibabacloud.com/help/zh')

soup = BeautifulSoup(html_doc, 'html.parser')
# Each element with class "masonry-list" sits under a <dl> whose <dt><h2> holds the category name.
keys = soup.find_all(class_="masonry-list")

for key in keys:
    category_dl = key.parent
    category = category_dl.dt.h2.string     # product category

    product_code = key.find('a')
    product_text = product_code.string
    product = product_text.replace(' ', '')     # product name, matching the folder created earlier

    os.makedirs(category, exist_ok=True)

    # Quote both paths so names containing spaces survive the shell.
    cmd = 'mv "' + product + '" "' + category + '"'
    os.system(cmd)
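
os.system('mv ...') only works where a Unix shell is available. The standard-library shutil module does the same move portably; a sketch using the same product and category variables as the loop above:

import shutil

# shutil.move(src, dst_dir) moves src inside dst_dir when dst_dir exists.
# The isdir guard skips products whose folder was never created
# (e.g. pages where every PDF download failed).
if os.path.isdir(product):
    shutil.move(product, category)

Note that shutil.move raises shutil.Error when category already contains a folder of the same name, so a rerun over already-moved folders needs extra handling.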