python爬虫篇4——爬取专利著作权信息

python爬虫篇4——爬取专利著作权信息
mysql代码:
-- Software-copyright records written by the scraper's Copyright class.
-- All descriptive columns are free text; `登记号` (registration number) is unique.
-- NOTE(review): the INSERT for this table is truncated in the Python listing;
-- presumably the unique key backs an upsert like the one used for `patent` — confirm.
CREATE TABLE `copyright` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `软件名称` varchar(500) DEFAULT NULL,
  `登记号` varchar(500) DEFAULT NULL,
  `分类号` varchar(500) DEFAULT NULL,
  `软件简称` varchar(500) DEFAULT NULL,
  `版本号` varchar(500) DEFAULT NULL,
  `⾸次发表⽇期` varchar(500) DEFAULT NULL,
  `登记批准⽇期` varchar(500) DEFAULT NULL,
  `软件著作权⼈` varchar(500) DEFAULT NULL,
  `软件著作权⼈详情` varchar(500) DEFAULT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `登记号` (`登记号`)
) ENGINE=InnoDB AUTO_INCREMENT=9871 DEFAULT CHARSET=utf8;

-- Patent records written by the scraper's Patent class.
-- `详情地址` (detail-page URL) is unique; the Python INSERT relies on this key
-- for its "on duplicate key update" clause.
CREATE TABLE `patent` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `专利名称` varchar(500) DEFAULT NULL,
  `发明⼈` varchar(500) DEFAULT NULL,
  `申请⼈` varchar(500) DEFAULT NULL,
  `申请⽇` datetime DEFAULT NULL,
  `公开⽇` datetime DEFAULT NULL,
  `详情地址` varchar(500) DEFAULT NULL,
  PRIMARY KEY (`id`),
  UNIQUE KEY `详情地址` (`详情地址`)
) ENGINE=InnoDB AUTO_INCREMENT=13610 DEFAULT CHARSET=utf8;
python代码:
getpatentdata.py主程序
import re
from urllib.parse import unquote, quote
from lxml import etree
from requests_html import HTMLSession
from 抓取专利著作权信息.MysqlHelper import MysqlHelper
# 获取专利信息
# Scrapes patent records for an applicant keyword and stores them in the MySQL
# `patent` table (see the INSERT statement in getPatent).
# NOTE(review): this listing has lost its indentation, and several lines are
# truncated or missing tokens; the NOTE(review) comments below mark each spot.
class Patent:
# Constructor: builds the database helper, then walks forward year by year.
# sqr is the applicant keyword; year is the first year to process.
def __init__(self, sqr, year):
self.helper = MysqlHelper(host='localhost',
# NOTE(review): 8080 is not MySQL's default port (3306) — confirm this is intended.
port=8080,
user='root',
passwd='123',
db='students',
charset='utf8')
# Running total that is shown in the progress messages below.
self.sum = 0
# Process years up to and including 2019.
while year <= 2019:
# From 2016 on the boundaries are monthly; earlier years use two-month steps.
if year >= 2016:
dateList = ["%s-01-01" % str(year), "%s-02-01" % str(year), "%s-03-01" % str(year),
"%s-04-01" % str(year), "%s-05-01" % str(year), "%s-06-01" % str(year),
"%s-07-01" % str(year), "%s-08-01" % str(year), "%s-09-01" % str(year),
"%s-10-01" % str(year), "%s-11-01" % str(year), "%s-12-01" % str(year),
"%s-12-31" % str(year)]
# NOTE(review): the next line repeats the previous one — likely a page-extraction artifact.
"%s-12-31" % str(year)]
else:
dateList = ["%s-01-01" % str(year), "%s-03-01" % str(year), "%s-05-01" % str(year),
"%s-07-01" % str(year),
"%s-09-01" % str(year),
"%s-11-01" % str(year), "%s-12-31" % str(year)]
print("*" * 66)
print("\033[36m开始抓取%s年的专利数据,已累计抓取%s条数据\033[0m" % (str(year), self.sum))
print("*" * 66)
# NOTE(review): the loop body is missing from this listing; presumably it calls
# self.getPatent(sqr, dateList[i], dateList[i + 1]) for each pair of
# neighbouring boundaries — confirm against the original file.
for i in range(len(dateList) - 1):
else:
year += 1
else:
print("\033[34m专利数据抓取完毕!共抓取%s条数据\033[0m" % str(self.sum))
# Show the menu again once all years are done.
showFunction()
# Create the table
def creatTable(self):
# NOTE(review): this line is truncated; the trailing "ute(sql1)" looks like the
# tail of a helper execute call fused onto the cut-off SQL string — confirm.
sql1 = "CREATE TABLE `patent` (`id` int primary key not null auto_increment,`专利名称` varchar(500) DEFAULT NULL  ,`发明⼈` varchar(500) DEFAULT NU        ute(sql1)
# Fetch data. sqr: applicant; sqday_start: start of the application-date range;
# sqday_end: end of the application-date range.
def getPatent(self, sqr, sqday_start, sqday_end):
# NOTE(review): the URL is truncated and the session assignment is fused onto
# the same line; host names in this listing ("dbpubki") also look shortened.
self.patent_url = "dbpubki/Grid2008/Dbpub/Brief.aspx?curpage=8&RecordsPerPage=350&QueryID=64&ID=SCPD&turnpage=1&systemno=&Nav        self.session = HTMLSession()
# Prefix that is prepended to each row's relative detail link below.
self.add_url = "dbpubki/Grid2008/Dbpub/"
# Request headers sent with the search POST.
self.headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip,deflate',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Connection': 'keep-alive',
'Content-Type': 'application/x-www-form-urlencoded',
'Origin': 'dbpubki',
'Host': 'dbpubki',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
}
# h = self.session.post(self.patent_url, headers=headers)
# pagenum = etree.HTML(h.html.html).xpath('//div[@id="id_grid_total"]/text()')[0][5:-3]
# print("共%s条数据" % pagenum)
# self.patent_url = self.patent_url + "&RecordsPerPage=" + pagenum
# NOTE(review): the form body below is truncated in two places; only the part
# that appends sqr and the paging/order fields is still visible.
self.data = "ID=SCPD&hdnSearchType=&hdnIsAll=false&NaviField=%E4%B8%93%E9%A2%98%E5%AD%90%E6%A0%8F%E7%9B%AE%E4%BB%A3%E7%            sqr) + "&imageField.x=50&imageField.y=11&searchmatch=0&order=dec&RecordsPerPage=350&hdnUSPSubDB=%E4%B8%93%E5%88%A9%E7%B1%BB        # print(unquote(datas))
# )
patentdata = ''
# Repeat the POST until the result table yields at least one row; the visible
# loop has no retry limit and no delay between attempts.
while len(patentdata) == 0:
self.h1 = self.session.post(self.patent_url, headers=self.headers, data=self.data)
# NOTE(review): etree.HTML() has lost its argument; presumably the HTML of self.h1 — confirm.
patentdata = etree.HTML().xpath('//table[@class="s_table"]//tr')
print("\033[31m从%s 到 %s 共有%s条专利数据\033[0m" % (sqday_start, sqday_end, len(patentdata) - 1))
# Write the data to the database
# Row 0 is skipped (presumably the table header, matching the "- 1" in the count above).
for i in range(1, len(patentdata)):
item = patentdata[i]
# number = item.xpath('./td[@class="s_tabletd_rb"]')[0].xpath('string(.)')
# Cells are read by position: 1 name, 2 inventor, 3 applicant, 4 application
# date, 5 publication date (order taken from the INSERT column list below).
patentname = item.xpath('./td[@class="s_tabletd_rb"]')[1].xpath('string(.)')
patentpeople = item.xpath('./td[@class="s_tabletd_rb"]')[2].xpath('string(.)')
sqpeople = item.xpath('./td[@class="s_tabletd_rb"]')[3].xpath('string(.)')
sqday = item.xpath('./td[@class="s_tabletd_rb"]')[4].xpath('string(.)')
openday = item.xpath('./td[@class="s_tabletd_rb"]')[5].xpath('string(.)')
# Detail-page address: URL prefix plus the first link found in the row.
address = self.add_url + item.xpath('./td[@class="s_tabletd_rb"]//a/@href')[0]
# print("*" * 66)
# NOTE(review): the SQL string is truncated after the first update assignment and
# the params list is fused onto the same line. The values appear twice in params:
# once for the insert and once for the "on duplicate key update" part.
sql = "insert into patent(`专利名称`,`发明⼈`,`申请⼈` ,`申请⽇`,`公开⽇` ,`详情地址`) values(%s,%s,%s,%s,%s,%s)on duplicate key update `专利名称` = %s            params = [patentname, patentpeople, sqpeople, sqday, openday, address, patentname,
patentpeople, sqpeople, sqday, openday, address]
# NOTE(review): "ute" looks like the tail of a helper execute call whose prefix was lost — confirm.
result = ute(sql, params)
# NOTE(review): '1292' is presumably the MySQL error code for an incorrect
# date/time value; the body of this branch is missing from the listing — confirm.
if str(result).__contains__('1292'):
else:
print(str(i) + '.' + patentname + '  数据⼊库成功!')
self.sum += 1
# 获取软件著作权信息
# Scrapes software-copyright records for a keyword and stores them in the MySQL
# `copyright` table (see the INSERT statement in getCopyrightData).
# NOTE(review): this listing has lost its indentation, and several lines are
# truncated or missing tokens; the NOTE(review) comments below mark each spot.
class Copyright:
# Constructor: builds the database helper.
# NOTE(review): key is not used in the visible body; presumably a call to
# self.getCopyrightData(key) was lost from the listing — confirm.
def __init__(self, key):
self.helper = MysqlHelper(host='localhost',
# NOTE(review): 8080 is not MySQL's default port (3306) — confirm this is intended.
port=8080,
user='root',
passwd='123',
db='students',
charset='utf8')
# Create the table
def creatTable(self):
# NOTE(review): this line is truncated; the trailing "ute(sql1)" looks like the
# tail of a helper execute call fused onto the cut-off SQL string — confirm.
sql1 = "CREATE TABLE `copyright` (`id` int primary key not null auto_increment,`软件名称` varchar(500) DEFAULT NULL  ,`登记号` varchar(500) DEFAULT        ute(sql1)
# Fetch every result page for the keyword and write each record to the database.
# NOTE(review): key is not referenced in the visible body; the line that builds
# the search URL from it appears to be missing — confirm.
def getCopyrightData(self, key):
# Prefix that is prepended to the copyright-holder detail link below.
# NOTE(review): the host name looks shortened by the page extraction.
self.add_url = 'www.qichacha'
# Current page number (starts at 1) and running count of stored records.
self.page = 1
self.sum = 0
self.session = HTMLSession()
self.headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
# NOTE(review): a session cookie is hard-coded here (and truncated, with the
# 'Host' entry fused onto the same line); it should not live in source code.
'Cookie': 'acw_tc=9dff1e1d15740724795763997e1d4fc677c413795a13ba5e12a187111d; QCCSESSID=4koqg095imku2ge3616s51au67; _uab_collina=15            'Host': 'www.qichacha',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
}
# The same request is issued again if the first attempt raises.
# NOTE(review): both request lines lost their leading tokens; presumably a
# session GET on a copyright URL attribute — confirm against the original file.
try:
self.h = (pyright_url, headers=self.headers)
except Exception:
self.h = (pyright_url, headers=self.headers)
# print()
# Get the total number of pages
# NOTE(review): etree.HTML() has lost its argument; presumably the HTML of self.h — confirm.
pagesum = etree.HTML().xpath('//a[@class="end"]/text()')[0]
while self.page <= int(pagesum):
print("*" * 66)
print('\033[31m开始抓取第%s页的数据,共%s页\033[0m' % (self.page, pagesum))
# NOTE(review): the loop condition below is garbled and the statement that
# extracts the record list from the response is missing — confirm.
while not pyrightdata):
self.h = (pyright_url, headers=self.headers)
# Write the data to the database
print("*" * 66)
# NOTE(review): the "in" keyword and part of the iterable name were lost on the next line.
for item pyrightdata:
# Software name
copyrightname = item.xpath('.//span[@class="name"]')[0].xpath('string(.)')
# The first <small> element is stripped of whitespace and split on colons;
# piece 1 holds the registration number followed by the "分类号" label,
# piece 2 holds the classification number.
djh_and_flh = re.split(r'[::]',
re.sub(r'\s+', '',
item.xpath('.//small[@class="text-muted clear text-ellipsis m-t-xs"]')[
0].xpath(
'string(.)')))
# Registration number: the text in front of the "分类号" label.
djh = re.findall(r'(.*?)分类号', djh_and_flh[1])[0]
# Classification number
flh = djh_and_flh[2]
# Empty values are replaced by the placeholder '空'.
if not len(djh):
djh = '空'
if not len(flh):
flh = '空'
# The second <small> element is parsed the same way.
rjjc_and_bbh = re.split(r':', re.sub(r'\s+', '', item.xpath(
'.//small[@class="text-muted clear text-ellipsis m-t-xs"]')[1].xpath('string(.)')))
# Software short name
rjname = re.findall(r'(.*?)版本号', rjjc_and_bbh[1])[0]
# Version number
bbh = rjjc_and_bbh[2]
# The third <small> element holds the two dates.
fbtime_and_pztime = re.split(r':', re.sub(r'\s+', '', item.xpath(
'.//small[@class="text-muted clear text-ellipsis m-t-xs"]')[2].xpath('string(.)')))
# First publication date
fbtime = re.findall(r'(.*?)登记批准⽇期', fbtime_and_pztime[1])[0]
# Registration approval date
pztime = fbtime_and_pztime[2]
# A lone '-' is stored as an empty string.
if djh == '-':
djh = ''
if flh == '-':
flh = ''
# Software copyright holder
rjzzqr = re.split(r':', re.sub(r'\s+', '',
item.xpath('.//footer [@class="panel-footer clear"]')[0].xpath(
'string(.)')))[1]
# Software copyright holder details (link); falls back to '空' when the footer has no link.
try:
rjurl = self.add_url + item.xpath('.//footer [@class="panel-footer clear"]/a/@href')[0]
except IndexError:
rjurl = '空'
# NOTE(review): the SQL string is truncated after "values(" and the params list
# is fused onto the same line. The values appear twice in params, presumably for
# an "on duplicate key update" clause like the one used for patents — confirm.
sql = "insert into copyright(`软件名称`,`登记号`,`分类号` ,`软件简称`,`版本号` ,`⾸次发表⽇期`,`登记批准⽇期`,`软件著作权⼈`,`软件著作权⼈详情`) values(                params = [copyrightname, djh, flh, rjname, bbh, fbtime, pztime,
rjzzqr, rjurl, copyrightname, djh, flh, rjname, bbh, fbtime, pztime,
rjzzqr, rjurl]
# NOTE(review): "ute" looks like the tail of a helper execute call whose prefix was lost — confirm.
result = ute(sql, params)
# NOTE(review): '1292' is presumably the MySQL error code for an incorrect
# date/time value; such results are skipped without a message — confirm.
if str(result).__contains__('1292'):
pass
else:
print(copyrightname + ',' +
djh + ',' + flh + ',' + rjname + ',' + bbh + ',' + fbtime + ',' + pztime + ',' + rjzzqr + ',' + rjurl)
print('数据⼊库成功!')
# print("*" * 66)
# Running count of records
self.sum += 1
# NOTE(review): with the indentation lost it is unclear which statement the two
# "else" branches below belong to; presumably the for loop and the while loop.
else:
print('\033[34m累计抓取数据%s条!\033[0m' % self.sum)
self.page += 1
else:
print("\033[34m著作权数据抓取完毕!共抓取%s条数据\033[0m" % str(self.sum))
# Show the menu again once all pages are done.
showFunction()
# 展⽰功能菜单
def showFunction():
    """Print the menu and run the action the user selects.

    The prompt repeats until a valid choice has been handled:

    * 1 - ask for an applicant keyword and run ``Patent(keyword, 1985)``
    * 2 - ask for an applicant keyword and run ``Patent(keyword, 2019)``
    * 3 - ask for a copyright keyword and run ``Copyright(keyword)``
    * 4 - print a goodbye message and exit the program

    An empty keyword returns to the menu prompt. Anything that is not one of
    the numbers above prints an error and prompts again.

    Raises:
        SystemExit: when option 4 is chosen.
    """
    # Local import so the block stays self-contained; exit() is only injected
    # by the `site` module and is not meant for use inside programs.
    import sys

    print("*" * 66)
    print("\t\t\t\t\t专利著作权信息下载⼯具V1.0\t\t\t\t\t")
    print("*" * 66)
    print("\033[34m请选择功能\n1.抓取全部专利数据\n2.已抓取全部专利数据,执⾏更新数据操作\n3.抓取全部著作权数据\n4.退出程序\033[0m")
    print("*" * 66)

    while True:
        fuc = input('请输⼊功能序号:')
        # isdecimal() (not isdigit()) so that every accepted string is also
        # accepted by int(); e.g. "²" passes isdigit() but makes int() raise.
        if not fuc.isdecimal():
            print("\033[31m输⼊错误,请输⼊功能序号!\033[0m")
            continue

        choice = int(fuc)
        if choice == 1:
            strs = input("请输⼊申请⼈关键词(直接回车键返回上⼀级):")
            if strs:
                # Full download: start from 1985.
                Patent(strs, 1985)
                return
        elif choice == 2:
            strs = input("请输⼊申请⼈关键词(直接回车键返回上⼀级):")
            if strs:
                # Update run: only the year 2019 is processed.
                Patent(strs, 2019)
                return
        elif choice == 3:
            strs = input("请输⼊著作权关键词(直接回车键返回上⼀级):")
            if strs:
                Copyright(strs)
                return
        elif choice == 4:
            print('程序已关闭...')
            sys.exit()
        else:
            print("\033[31m输⼊错误,请输⼊正确的功能序号!\033[0m")
# Patent("江西")
# Patent("南昌")
# Start the interactive menu only when the file is run as a script.
if __name__ == '__main__':
    showFunction()
MysqlHelper.py数据库辅助连接类:
from click._compat import raw_input
from pymysql import *
class MysqlHelper:
    """Wrapper that holds the settings for a MySQL connection.

    The visible part of the class only stores the connection parameters; it
    does not open a connection.
    """

    def __init__(self, host: str, port: int, user: str, passwd: str, db: str, charset: str) -> None:
        """Store the database connection parameters.

        Args:
            host: Address of the database server.
            port: Port of the database server.
            user: Database user name.
            passwd: Database password.
            db: Name of the database.
            charset: Character encoding.
        """
        self.host = host
        self.port = port
        self.user = user
        self.passwd = passwd
        self.db = db
        self.charset = charset

本文发布于:2024-09-22 21:25:40,感谢您对本站的认可!

本文链接:https://www.17tex.com/tex/2/418520.html

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。

下一篇:KEY TESTER
标签:抓取   数据   专利
留言与评论(共有 0 条评论)
   
验证码:
Copyright ©2019-2024 Comsenz Inc.Powered by © 易纺专利技术学习网 豫ICP备2022007602号 豫公网安备41160202000603 站长QQ:729038198 关于我们 投诉建议