A Python Crawler for Fetching Public Company Information from Qichacha
2022-10-24 15:45:48

1. Reference blog post

https://blog.csdn.net/qq_39295735/article/details/84504848?utm_medium=distribute.pc_relevant.none-task-blog-searchFromBaidu-2.control&depth_1-utm_source=distribute.pc_relevant.none-task-blog-searchFromBaidu-2.control

2. Python code

#!/usr/bin/python3
#-*- coding: utf-8 -*-
 
import urllib.request
import re
import pymysql
 
 
#Disguise the crawler as a browser by setting a browser User-Agent
def spider2browser():
    headers=("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
#end of spider2browser
 
#Clean up noise in the scraped text: newlines, stray tags, extra spaces, missing values
def getTargetInfo(result):
    if not result:
        return '暂无'  # "not available yet"
    else:
        info = re.sub(r'\n', "", str(result[0]))  #strip newlines
        info = re.sub(r'<br/>', "", info)  #strip leftover <br/> tags
        info = re.sub(r' ', "", info)  #strip extra spaces
        return info
#end of getTargetInfo()
 
#Fetch a company's details from qichacha.com by its (Chinese) name
def getCompanyInfo(chinaCompany):
    companyInfo = {'company':chinaCompany, 'legal_person':'暂无', 'domain':'暂无', 'address':'暂无'}

    #Percent-encode the Chinese company name so it is valid in a URL; only the non-ASCII part is encoded, never the whole URL
    company=urllib.parse.quote(chinaCompany)
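    # e.g. urllib.parse.quote("企查查") -> '%E4%BC%81%E6%9F%A5%E6%9F%A5' (UTF-8 percent-encoding)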
    firstUrl="https://www.qichacha.com/search?key=" + company
    #print("visit web: " + firstUrl)
 
    #Extract the legal representative's name from the search-results page
    searchRet=urllib.request.urlopen(firstUrl).read().decode("utf-8", "ignore")
    matchPat='法定代表人.*?>(.*?)</a>'
    bosses = re.compile(matchPat, re.S).findall(searchRet)
    companyInfo['legal_person'] = getTargetInfo(bosses)
 
    #Parse the search-results page for the link to the company's detail page
    matchPat='addSearchIndex.*?href="(.*?)" target="_blank" class="ma_h1"'
    nextUrls = re.compile(matchPat, re.S).findall(searchRet)
    if not nextUrls:
        return companyInfo
    nextUrl = "https://www.qichacha.com" + str(nextUrls[0])
 
    #Fetch the detail page and extract the company's official website
    searchRet=urllib.request.urlopen(nextUrl).read().decode("utf-8", "ignore")
    matchPat = 'data-delay="500" rel="nofollow">(.*?)</a> <a onclick'
    urls=re.compile(matchPat, re.S).findall(searchRet)
    companyInfo['domain'] = getTargetInfo(urls)
 
    #Extract the registered address
    matchPat='title="查看地址".*?>(.*?)</a>'
    addresses=re.compile(matchPat, re.S).findall(searchRet)
    companyInfo['address'] = getTargetInfo(addresses)
 
    return companyInfo
#end of getCompanyInfo()
 
#Write one company's details into the database
def writeInfoToDB(cursor, companyInfo):
    sql = "insert into company_info(company,domain,legal_person,address) values(%s, %s, %s, %s)"
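    # the %s placeholders are filled by cursor.execute(), which escapes the values (no manual SQL string building)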
    val = (companyInfo['company'], companyInfo['domain'], companyInfo['legal_person'],companyInfo['address'])
    try:
        cursor.execute(sql, val)
        db.commit()  # 'db' is the module-level connection opened in the main flow below
        print("Info: saved company %s" % companyInfo['company'])
    except Exception as err:
        db.rollback()
        print("Error: failed to save company %s" % companyInfo['company'])
        print(err)

#end of writeInfoToDB()
    
#========= Main flow starts here =========#
#Read the full list of company names from the database
db = pymysql.connect(host="139.159.xxx.xxx", user="username", password="password", database="database")
cursor = db.cursor()
cursor.execute("select `company` from `company_list`")
companyList = cursor.fetchall()
 
#Disguise the crawler as a browser
spider2browser()
 
#Fetch each company's info from qichacha.com and save it to the database
for row in companyList:
    companyInfo = getCompanyInfo(row[0])
    writeInfoToDB(cursor, companyInfo)

db.close()
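The script expects both tables to exist before it runs. Below is a minimal setup sketch: the column names match the code above, but the types, lengths, and the helper itself are assumptions for illustration, not part of the original post.

#!/usr/bin/python3
#-*- coding: utf-8 -*-
#Hypothetical one-off setup script; column types and lengths are assumed
import pymysql

db = pymysql.connect(host="139.159.xxx.xxx", user="username", password="password", database="database")
cursor = db.cursor()

#Input table: one company name per row, read by the main loop above
cursor.execute("""
    create table if not exists `company_list` (
        `company` varchar(128) not null primary key
    ) default charset=utf8mb4
""")

#Output table: one row per company, written by writeInfoToDB()
cursor.execute("""
    create table if not exists `company_info` (
        `id` int auto_increment primary key,
        `company` varchar(128),
        `domain` varchar(256),
        `legal_person` varchar(64),
        `address` varchar(512)
    ) default charset=utf8mb4
""")

db.commit()
db.close()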

3. Database results
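To check what ended up in company_info, a quick query like the following prints every saved row (a sketch, reusing the connection parameters from above):

#!/usr/bin/python3
#-*- coding: utf-8 -*-
import pymysql

db = pymysql.connect(host="139.159.xxx.xxx", user="username", password="password", database="database")
cursor = db.cursor()
cursor.execute("select `company`, `domain`, `legal_person`, `address` from `company_info`")
for company, domain, legal_person, address in cursor.fetchall():
    print(company, domain, legal_person, address)
db.close()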
