... | @@ -25,6 +25,14 @@ environmental_protection_grade |
... | @@ -25,6 +25,14 @@ environmental_protection_grade |
|
浙江:http://223.4.71.96/portal/data/api/auto
|
|
浙江:http://223.4.71.96/portal/data/api/auto
|
|
福建:http://220.160.52.213:20071/api/template/page/p_list_eval_credit
|
|
福建:http://220.160.52.213:20071/api/template/page/p_list_eval_credit
|
|
四川:http://103.203.219.138:18081/data/w/evaluateResults/list4Public
|
|
四川:http://103.203.219.138:18081/data/w/evaluateResults/list4Public
|
|
|
|
湖南:http://218.76.24.162:5014/hnxypjqyd/xxgk/queryXxgsQy
|
|
|
|
河南:http://222.143.24.250:8127/credit_publicService/system/company/systemcompanyinfo/getPublicResultListPage.do
|
|
|
|
湖北:http://113.57.151.5:8030/HBHB/companyInfo.action
|
|
|
|
广东:https://www-app.gdeei.cn/gdeepub/data/industry
|
|
|
|
贵州:http://202.98.194.198:6661/wwgs/xypj/public/credit/ratingpublicity/primaryPublicity.jsp
|
|
|
|
广西:http://202.103.233.156:9081/xypjgx/pages/xypj/wzgs/qypjjgList.jsp
|
|
|
|
河北:http://110.249.223.66:8099/xypjww/xypj/listEntEvaluate
|
|
|
|
辽宁:http://221.180.204.224:8080/LiaoNingQiYeXinYongPingJia/display/getEnterpriseInfo.do
|
|
...
|
|
...
|
|
|
|
|
|
采集文件存放路径:
|
|
采集文件存放路径:
|
... | @@ -40,7 +48,6 @@ environmental_protection_grade |
... | @@ -40,7 +48,6 @@ environmental_protection_grade |
|
<!--无需更新?每天全量更新?逐条轮询更新?多久更新完一轮?或其他-->
|
|
<!--无需更新?每天全量更新?逐条轮询更新?多久更新完一轮?或其他-->
|
|
```text
|
|
```text
|
|
目前全量更新一轮
|
|
目前全量更新一轮
|
|
|
|
|
|
```
|
|
```
|
|
|
|
|
|
|
|
|
... | @@ -76,9 +83,9 @@ environmental_protection |
... | @@ -76,9 +83,9 @@ environmental_protection |
|
## 队列名称及队列地址
|
|
## 队列名称及队列地址
|
|
<!--redis host port db key 优先级说明-->
|
|
<!--redis host port db key 优先级说明-->
|
|
-
|
|
-
|
|
* redis host: redis://:utn@0818@bdp-mq-001.redis.rds.aliyuncs.com:6379/0
|
|
* redis host: redis://:utn@0818@bdp-mq-001.redis.rds.aliyuncs.com:6379/7
|
|
* redis port: 6379
|
|
* redis port: 6379
|
|
* redis db: 0
|
|
* redis db: 7
|
|
* redis key:
|
|
* redis key:
|
|
* environmental_protection
|
|
* environmental_protection
|
|
|
|
|
... | @@ -88,26 +95,33 @@ environmental_protection |
... | @@ -88,26 +95,33 @@ environmental_protection |
|
## 任务来源
|
|
## 任务来源
|
|
<!--说明爬虫任务的输入。如:来自某个数据库表等。如果来自某个数据库表则应该简要说明该表内的数据是如何维护的。-->
|
|
<!--说明爬虫任务的输入。如:来自某个数据库表等。如果来自某个数据库表则应该简要说明该表内的数据是如何维护的。-->
|
|
```buildoutcfg
|
|
```buildoutcfg
|
|
|
|
导入任务配置文件路径:http://tech.pingansec.com/granite/project-gravel/-/blob/develop_environmental_protection_grade/app_environmental_protection_grade/data_pump/normal_list_task.yml
|
|
```
|
|
```
|
|
|
|
|
|
## 任务输入参数(样例)
|
|
## 任务输入参数(样例)
|
|
其中CRNo(公司编号)为必需
|
|
|
|
```json
|
|
|
|
{"province": "jiangsu", "step": "start"}
|
|
|
|
{"province": "zhejiang", "step": "start"}
|
|
|
|
{"province": "fujian", "step": "start"}
|
|
|
|
{"province": "sichuan", "step": "start"}
|
|
|
|
```
|
|
|
|
|
|
|
|
### 任务样例
|
|
### 任务样例
|
|
<!--注意是爬虫拿到的完整任务,而不仅仅是task_params-->
|
|
<!--注意是爬虫拿到的完整任务,而不仅仅是task_params-->
|
|
```buildoutcfg
|
|
```json
|
|
|
|
{"province": "jiangsu", "step": "start"},
|
|
|
|
{"province": "zhejiang", "step": "start"},
|
|
|
|
{"province": "fujian", "step": "start"},
|
|
|
|
{"province": "sichuan", "step": "start"},
|
|
|
|
{"province": "hunan", "step": "start"},
|
|
|
|
{"province": "henan", "step": "start"},
|
|
|
|
{"province": "hubei", "step": "start"},
|
|
|
|
{"province": "guangdong", "step": "start"},
|
|
|
|
{"province": "guizhou", "step": "start"},
|
|
|
|
{"province": "guangxi", "step": "start"},
|
|
|
|
{"province": "hebei"},
|
|
|
|
{"province": "liaoning"}
|
|
```
|
|
```
|
|
|
|
|
|
### 任务参数说明
|
|
### 任务参数说明
|
|
<!--特有参数说明,通用参数比如spider_name,task_params,task_src,task_result等不需说明-->
|
|
<!--特有参数说明,通用参数比如spider_name,task_params,task_src,task_result等不需说明-->
|
|
```json
|
|
```json
|
|
{"province": "jiangsu", "step": "start", "index": 0}
|
|
{"province": "henan", "step": "start", "index": 0, "city": "郑州市"}
|
|
|
|
{"province": "guizhou", "step": "start", "index": 0, "year": 2019}
|
|
```
|
|
```
|
|
|
|
|
|
> + 主要参数
|
|
> + 主要参数
|
... | @@ -115,25 +129,18 @@ environmental_protection |
... | @@ -115,25 +129,18 @@ environmental_protection |
|
> + 非必要参数
|
|
> + 非必要参数
|
|
> + step: 步骤
|
|
> + step: 步骤
|
|
> + index: 翻页的页数
|
|
> + index: 翻页的页数
|
|
|
|
> + city: 地市,仅河南省有该字段
|
|
|
|
> + year: 年份,仅贵州省有该字段
|
|
|
|
|
|
|
|
|
|
## data_type说明
|
|
## data_type说明
|
|
<!--可能产生的data_type说明-->
|
|
<!--可能产生的data_type说明-->
|
|
```buildoutcfg
|
|
```buildoutcfg
|
|
list: 列表页数据
|
|
list: 列表页数据
|
|
|
|
|
|
福建:
|
|
|
|
list_of_normal: 全程公示
|
|
|
|
list_of_red: 红黑榜
|
|
|
|
|
|
|
|
...开发中
|
|
|
|
```
|
|
```
|
|
|
|
|
|
## 爬虫结果的超级数据
|
|
## 爬虫结果的超级数据
|
|
<!--包含所有字段的json数据,每个value都要有样例值-->
|
|
<!--包含所有字段的json数据,每个value都要有样例值-->
|
|
```text
|
|
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
## 实际爬虫结果的数据结构
|
|
## 实际爬虫结果的数据结构
|
|
<!--可能与超级数据一致,可能不同的data_type的爬虫结果结构不同,超级数据是把所有data_type的结果组合在一起-->
|
|
<!--可能与超级数据一致,可能不同的data_type的爬虫结果结构不同,超级数据是把所有data_type的结果组合在一起-->
|
... | @@ -205,15 +212,15 @@ list_of_red: 红黑榜 |
... | @@ -205,15 +212,15 @@ list_of_red: 红黑榜 |
|
"data":
|
|
"data":
|
|
[
|
|
[
|
|
{
|
|
{
|
|
"city": "舟山市",
|
|
"city": "舟山市", # 市
|
|
"level_title": "优秀",
|
|
"level_title": "优秀",
|
|
"district": "普陀区",
|
|
"district": "普陀区", # 县
|
|
"level_code": "A",
|
|
"level_code": "A", # 信用等级
|
|
"social_credit_code": "91330903336897819J",
|
|
"social_credit_code": "91330903336897819J", # 统一社会信用代码
|
|
"score_code": "8ca03c8e-634c-49cb-b35c-bac95ce46910",
|
|
"score_code": "8ca03c8e-634c-49cb-b35c-bac95ce46910",
|
|
"ent_name": "舟山丰瑞海洋生物制品有限公司",
|
|
"ent_name": "舟山丰瑞海洋生物制品有限公司",
|
|
"region_code": "330903",
|
|
"region_code": "330903",
|
|
"release_time": 1635264000000
|
|
"release_time": 1635264000000 # 更新时间
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"city": "舟山市",
|
|
"city": "舟山市",
|
... | @@ -269,13 +276,13 @@ list_of_red: 红黑榜 |
... | @@ -269,13 +276,13 @@ list_of_red: 红黑榜 |
|
"id": 11795,
|
|
"id": 11795,
|
|
"social_credit_code": "91350504MA2Y13F352",
|
|
"social_credit_code": "91350504MA2Y13F352",
|
|
"credit_year_batch": "2021年第二批",
|
|
"credit_year_batch": "2021年第二批",
|
|
"ent_name": "泉州佰份佰卫生用品有限公司",
|
|
"ent_name": "泉州佰份佰卫生用品有限公司", # 企业名称
|
|
"county": "洛江区",
|
|
"county": "洛江区", # 区县
|
|
"city": "泉州市",
|
|
"city": "泉州市", # 地市
|
|
"deptName": "泉州市洛江生态环境局",
|
|
"deptName": "泉州市洛江生态环境局", # 评价单位
|
|
"createTime": "2021-06-03",
|
|
"createTime": "2021-06-03", # 评价时间
|
|
"credit": "79",
|
|
"credit": "79",
|
|
"credit_type": "环保良好企业"
|
|
"credit_type": "环保良好企业" # 信用等级
|
|
},
|
|
},
|
|
{
|
|
{
|
|
"id": 11794,
|
|
"id": 11794,
|
... | @@ -312,7 +319,7 @@ list_of_red: 红黑榜 |
... | @@ -312,7 +319,7 @@ list_of_red: 红黑榜 |
|
}
|
|
}
|
|
```
|
|
```
|
|
|
|
|
|
#### 四川:
|
|
#### 四川:(参照字段解析)
|
|
```json
|
|
```json
|
|
{
|
|
{
|
|
"data":
|
|
"data":
|
... | @@ -322,7 +329,7 @@ list_of_red: 红黑榜 |
... | @@ -322,7 +329,7 @@ list_of_red: 红黑榜 |
|
"enterprise":
|
|
"enterprise":
|
|
{
|
|
{
|
|
"id": 354497052,
|
|
"id": 354497052,
|
|
"name": "阆中市枣碧大梁山页岩机砖厂",
|
|
"name": "阆中市枣碧大梁山页岩机砖厂", # 参照字段解析
|
|
"creditCode": "92511381MA695TYH83",
|
|
"creditCode": "92511381MA695TYH83",
|
|
"orgCode": "MA695TYH8",
|
|
"orgCode": "MA695TYH8",
|
|
"enterpriseType": "PRIVATE_OWNED",
|
|
"enterpriseType": "PRIVATE_OWNED",
|
... | @@ -449,7 +456,7 @@ list_of_red: 红黑榜 |
... | @@ -449,7 +456,7 @@ list_of_red: 红黑榜 |
|
"spider_ip": "10.8.1.18"
|
|
"spider_ip": "10.8.1.18"
|
|
}
|
|
}
|
|
```
|
|
```
|
|
> [四川字段解析](data_stream/environmental_protection_related/sichuan_field.json)
|
|
> [字段解析](data_stream/environmental_protection_related/field)
|
|
|
|
|
|
#### 湖南:
|
|
#### 湖南:
|
|
```json
|
|
```json
|
... | @@ -756,6 +763,135 @@ list_of_red: 红黑榜 |
... | @@ -756,6 +763,135 @@ list_of_red: 红黑榜 |
|
}
|
|
}
|
|
```
|
|
```
|
|
|
|
|
|
|
|
#### 河北:
|
|
|
|
```json
|
|
|
|
{
|
|
|
|
"data":
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"qyid": 16622,
|
|
|
|
"qymc": "邯郸新兴发电有限责任公司", # 企业名称
|
|
|
|
"orgcode": "911304817840855238", # 企业代码
|
|
|
|
"qyxzname": "重点排污单位",
|
|
|
|
"id": 16153,
|
|
|
|
"stime": "1585908614273",
|
|
|
|
"etime": null,
|
|
|
|
"sfhmd": 0,
|
|
|
|
"ljpf": 96, # 评分
|
|
|
|
"qybz": 1, # 企业标识 1:A类企业、2:B类企业、3:C类企业、4:D类企业、5:E类企业
|
|
|
|
"sfcp": null,
|
|
|
|
"hstime": null,
|
|
|
|
"hetime": null,
|
|
|
|
"gstime": "1599548668767", # 更新时间戳?不确定是否该字段
|
|
|
|
"zq_stime": "1585908614273",
|
|
|
|
"zq_etime": null,
|
|
|
|
"sfww": null,
|
|
|
|
"hmdtime": null,
|
|
|
|
"qybzname": null,
|
|
|
|
"ljpfs": null,
|
|
|
|
"xqcount": 0
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"qyid": 2511,
|
|
|
|
"qymc": "中普(邯郸)钢铁有限公司",
|
|
|
|
"orgcode": "75025136X",
|
|
|
|
"qyxzname": "重点排污单位",
|
|
|
|
"id": 17534,
|
|
|
|
"stime": null,
|
|
|
|
"etime": null,
|
|
|
|
"sfhmd": 0,
|
|
|
|
"ljpf": 89,
|
|
|
|
"qybz": 1,
|
|
|
|
"sfcp": null,
|
|
|
|
"hstime": null,
|
|
|
|
"hetime": null,
|
|
|
|
"gstime": "1599548496797",
|
|
|
|
"zq_stime": null,
|
|
|
|
"zq_etime": null,
|
|
|
|
"sfww": null,
|
|
|
|
"hmdtime": null,
|
|
|
|
"qybzname": null,
|
|
|
|
"ljpfs": null,
|
|
|
|
"xqcount": 0
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"http_code": 200,
|
|
|
|
"error_msg": "",
|
|
|
|
"task_result": 1000,
|
|
|
|
"data_type": "list",
|
|
|
|
"spider_start_time": "2021-10-30 10:16:49.339",
|
|
|
|
"spider_end_time": "2021-10-30 10:16:54.067",
|
|
|
|
"task_params": {"province": "hebei"},
|
|
|
|
"metadata": {"index": "36"},
|
|
|
|
"spider_name": "environmental_protection",
|
|
|
|
"spider_ip": "10.8.1.38"
|
|
|
|
}
|
|
|
|
```
|
|
|
|
|
|
|
|
#### 辽宁:
|
|
|
|
```json
|
|
|
|
{
|
|
|
|
"data":
|
|
|
|
{
|
|
|
|
"pageNum": 22,
|
|
|
|
"pageSize": 50,
|
|
|
|
"size": 50,
|
|
|
|
"startRow": 1051,
|
|
|
|
"endRow": 1100,
|
|
|
|
"total": 3988,
|
|
|
|
"pages": 80,
|
|
|
|
"list":
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"year": 2019, # 年份
|
|
|
|
"enterpriseName": "大连金海润废物综合利用有限公司", # 企业名称
|
|
|
|
"city": "大连市", # 所在城市
|
|
|
|
"area": "金普新区", # 所在区县
|
|
|
|
"enterpriseCode": "91210213MA0YT5EY7C", # 统一社会信用代码
|
|
|
|
"industry": "危险废物治理", # 行业类别
|
|
|
|
"pollutionGroup": "重点排污单位", # 污染源企业分类
|
|
|
|
"scoreNum": 11, # 参照字段解析
|
|
|
|
"isStop": "0" # 参照字段解析
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"year": 2019,
|
|
|
|
"enterpriseName": "大连金州第三水泥厂",
|
|
|
|
"city": "大连市",
|
|
|
|
"area": "金普新区",
|
|
|
|
"enterpriseCode": "91210213242935001B",
|
|
|
|
"industry": "水泥、石灰和石膏制造",
|
|
|
|
"pollutionGroup": "非重点排污单位",
|
|
|
|
"scoreNum": 11,
|
|
|
|
"isStop": "0"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"prePage": 21,
|
|
|
|
"nextPage": 23,
|
|
|
|
"isFirstPage": false,
|
|
|
|
"isLastPage": false,
|
|
|
|
"hasPreviousPage": true,
|
|
|
|
"hasNextPage": true,
|
|
|
|
"navigatePages": 8,
|
|
|
|
"navigatepageNums": [18,19,20,21,22,23,24,25],
|
|
|
|
"navigateFirstPage": 18,
|
|
|
|
"navigateLastPage": 25,
|
|
|
|
"firstPage": 18,
|
|
|
|
"lastPage": 25
|
|
|
|
},
|
|
|
|
"http_code": 200,
|
|
|
|
"error_msg": "",
|
|
|
|
"task_result": 1000,
|
|
|
|
"data_type": "detail",
|
|
|
|
"spider_start_time": "2021-10-28 18:08:56.553",
|
|
|
|
"spider_end_time": "2021-10-28 18:09:28.191",
|
|
|
|
"task_params": {"province": "liaoning"},
|
|
|
|
"metadata": {"index": "22"},
|
|
|
|
"spider_name": "environmental_protection",
|
|
|
|
"spider_ip": "10.8.6.51"
|
|
|
|
}
|
|
|
|
```
|
|
|
|
> [字段解析](data_stream/environmental_protection_related/field)
|
|
|
|
|
|
|
|
|
|
## 爬虫运行环境
|
|
## 爬虫运行环境
|
|
<!--udm模块?scrapy?或其他-->
|
|
<!--udm模块?scrapy?或其他-->
|
... | @@ -769,7 +905,7 @@ scrapy |
... | @@ -769,7 +905,7 @@ scrapy |
|
```buildoutcfg
|
|
```buildoutcfg
|
|
target: node_51,
|
|
target: node_51,
|
|
spider_name: environmental_protection
|
|
spider_name: environmental_protection
|
|
5个进程
|
|
3个进程
|
|
```
|
|
```
|
|
|
|
|
|
|
|
|
... | | ... | |