... | ... | @@ -13,13 +13,17 @@ |
|
|
## 数据英文名称
|
|
|
<!--英文名称,后续流程中所有涉及到英文名称均以此为准,如:partner、shixin、general_taxpayer等-->
|
|
|
```
|
|
|
law_office
|
|
|
|
|
|
bj_law_office
|
|
|
sh_law_office
|
|
|
```
|
|
|
|
|
|
## 采集网站(采集入口)
|
|
|
<!--采集的入口地址,不能只是一个网站域名,具体到该网站的数据入口-->
|
|
|
```
|
|
|
http://12348.moj.gov.cn/#/publicies/lawdept/lawdept
|
|
|
http://xkyw.bjsf.gov.cn/lawofficeaction.do?method=queryService&chaxun=lvsuocx&ifHq=1&page.currentPage=1&page.totalPerPage=19
|
|
|
http://sh.12348.gov.cn/sites/12348/team.jsp?typeId=cf2ec647762641e897c0d1fd7f474141&zoneId=&sort=All&servant=All&normal=true&award=&member=All&businessArea=All&jbusinessArea=All&partWord=#
|
|
|
```
|
|
|
|
|
|
## 采集频率及采集策略
|
... | ... | @@ -46,6 +50,12 @@ http://12348.moj.gov.cn/#/publicies/lawdept/lawdept |
|
|
```
|
|
|
爬虫名称: law_office
|
|
|
平台: 中国法律服务网-律师事务所
|
|
|
|
|
|
爬虫名称: bj_law_office
|
|
|
平台: 北京市司法局律师管理系统律所查询
|
|
|
|
|
|
爬虫名称: sh_law_office
|
|
|
平台: 上海法网-律师事务所
|
|
|
```
|
|
|
|
|
|
|
... | ... | @@ -63,6 +73,8 @@ http://192.168.109.110/granite/project-gravel/-/tree/develop_adsearch_20210517/s |
|
|
|
|
|
项目入口脚本
|
|
|
http://192.168.109.110/granite/project-gravel/-/blob/develop_adsearch_20210517/scrapy_spiders/gravel_spiders/spiders/law_office.py
|
|
|
http://192.168.109.110/granite/project-gravel/-/blob/develop_adsearch_20210517/scrapy_spiders/gravel_spiders/spiders/bj_law_office.py
|
|
|
http://192.168.109.110/granite/project-gravel/-/blob/develop_adsearch_20210517/scrapy_spiders/gravel_spiders/spiders/sh_law_office.py
|
|
|
|
|
|
代码具体实现板块
|
|
|
http://192.168.109.110/granite/project-gravel/-/tree/develop_adsearch_20210517/scrapy_spiders/gravel_spiders/spiders/law_office_reqs
|
... | ... | @@ -79,6 +91,8 @@ http://192.168.109.110/granite/project-gravel/-/tree/develop_adsearch_20210517/s |
|
|
### 优先级队列说明
|
|
|
```
|
|
|
law_office
|
|
|
bj_law_office
|
|
|
sh_law_office
|
|
|
说明:无特别处理,都是默认优先级10
|
|
|
```
|
|
|
|
... | ... | @@ -87,16 +101,27 @@ law_office |
|
|
## 任务来源
|
|
|
<!--说明爬虫任务的输入。如:来自某个数据库表等。如果来自某个数据库表则应该简要说明该表内的数据是如何维护的。-->
|
|
|
```
|
|
|
中国法律服务网(law_office)
|
|
|
通过 ajax 请求以下接口,返回 JSON 格式数据:
|
|
|
http://12348.moj.gov.cn/lawerdeptlist/getlawerdeptlist
|
|
|
|
|
|
北京律所(bj_law_office)与上海律所(sh_law_office):数据从 HTML 页面中解析提取
|
|
|
```
|
|
|
|
|
|
## 任务输入参数(样例)
|
|
|
```json
|
|
|
|
|
|
{
|
|
|
"platform": "law_office",
|
|
|
"data_type": "list"
|
|
|
}
|
|
|
{
|
|
|
"platform": "bj_law_office",
|
|
|
"data_type": "list"
|
|
|
}
|
|
|
{
|
|
|
"platform": "sh_law_office",
|
|
|
"data_type": "list"
|
|
|
}
|
|
|
```
|
|
|
|
|
|
|
... | ... | @@ -107,13 +132,21 @@ http://12348.moj.gov.cn/lawerdeptlist/getlawerdeptlist |
|
|
"platform": "law_office",
|
|
|
"data_type": "list"
|
|
|
}
|
|
|
{
|
|
|
"platform": "bj_law_office",
|
|
|
"data_type": "list"
|
|
|
}
|
|
|
{
|
|
|
"platform": "sh_law_office",
|
|
|
"data_type": "list"
|
|
|
}
|
|
|
```
|
|
|
|
|
|
### 任务参数说明
|
|
|
<!--特有参数说明,通用参数比如spider_name,task_params,task_src,task_result等不需说明-->
|
|
|
```json
|
|
|
{
|
|
|
"area_code": "law_office",
|
|
|
"platform": "law_office",
|
|
|
"data_type": "list"
|
|
|
}
|
|
|
说明:任务直接写入 redis,不经过 taskhub
|
... | ... | @@ -130,6 +163,7 @@ log: 日志记录 |
|
|
## 爬虫结果的超级数据
|
|
|
<!--包含所有字段的json数据,每个value都要有样例值-->
|
|
|
```
|
|
|
中国法网
|
|
|
{
|
|
|
"data": [{
|
|
|
"jj": "",
|
... | ... | @@ -164,6 +198,79 @@ log: 日志记录 |
|
|
"proxy_ip": "http://127.0.0.1:8888"
|
|
|
}
|
|
|
|
|
|
北京律所
|
|
|
{
|
|
|
"data": {
|
|
|
"bj_law_office": {
|
|
|
"company_name": "北京腾骏律师事务所",
|
|
|
"credit_no": "31110000306433298G",
|
|
|
"company_status": "正常",
|
|
|
"en_name": " ",
|
|
|
"company_address": "北京市朝阳区霄云里8号1单元12层1501室216",
|
|
|
"organizer": "朝阳区司法局",
|
|
|
"issue_date": " 2014-06-30 00:00:00.0 ",
|
|
|
"capital": "50.0",
|
|
|
"legal_person": "李雯",
|
|
|
"composion_form": "个人",
|
|
|
"e_mail": "liwen20@hotmail.com",
|
|
|
"web_url": null,
|
|
|
"phone": "85188812",
|
|
|
"fax": " 85188812",
|
|
|
"brief_introduction": ""
|
|
|
}
|
|
|
},
|
|
|
"http_code": 200,
|
|
|
"error_msg": "",
|
|
|
"task_result": 1000,
|
|
|
"data_type": "detail",
|
|
|
"spider_start_time": "2021-08-26 11:44:50.409",
|
|
|
"spider_end_time": "2021-08-26 11:44:51.591",
|
|
|
"task_params": {
|
|
|
"platform": "bj_law_office",
|
|
|
"data_type": "list"
|
|
|
},
|
|
|
"metadata": {
|
|
|
"current_page": 15
|
|
|
},
|
|
|
"spider_name": "bj_law_office",
|
|
|
"spider_ip": "10.8.1.42",
|
|
|
"proxy_ip": "http://10.8.6.219:38080"
|
|
|
}
|
|
|
|
|
|
上海律所
|
|
|
{
|
|
|
"data": {
|
|
|
"sh_law_office": {
|
|
|
"company_name": "上海维盈律师事务所",
|
|
|
"lawyer_num": "27",
|
|
|
"license_no": "23101201010322666",
|
|
|
"company_address": "东方路971号钱江大厦24D",
|
|
|
"phone": "50812019",
|
|
|
"legal_person": "赵德强",
|
|
|
"composion_form": "普通合伙",
|
|
|
"company_status": "正常",
|
|
|
"credit_no": "55000011-X",
|
|
|
"issue_date": "2010-01-11",
|
|
|
"organizer": "浦东新区司法局"
|
|
|
}
|
|
|
},
|
|
|
"http_code": 200,
|
|
|
"error_msg": "",
|
|
|
"task_result": 1000,
|
|
|
"data_type": "detail",
|
|
|
"spider_start_time": "2021-08-27 14:18:19.156",
|
|
|
"spider_end_time": "2021-08-27 14:18:19.801",
|
|
|
"task_params": {
|
|
|
"platform": "sh_law_office",
|
|
|
"data_type": "list"
|
|
|
},
|
|
|
"metadata": {
|
|
|
"current_page": 9
|
|
|
},
|
|
|
"spider_name": "sh_law_office",
|
|
|
"spider_ip": "10.8.1.42",
|
|
|
"proxy_ip": "http://10.8.6.219:38080"
|
|
|
}
|
|
|
```
|
|
|
|
|
|
## 实际爬虫结果的数据结构
|
... | ... | |