配置信息:
使用gdpi.sada_gdpi_click库时必须加上这段权限
set /user/gdpi/public/sada_gdpi_click.password=GWDPI-SH;
解决分片数过多问题
set mapreduce.job.max.split.locations = 10000;
配置reduce数量
SET mapred.reduce.tasks = 80;
配置使用内存
SET mapreduce.map.memory.mb=3048;
SET mapreduce.reduce.memory.mb=7000;
配置Java虚拟机内存
SET mapreduce.map.java.opts='-Xmx6096m';
SET mapreduce.reduce.java.opts='-Xmx10000m';
配置mapjoin
SET hive.auto.convert.join=false;
SET hive.ignore.mapjoin.hint=false;
配置是否使用压缩
SET hive.exec.compress.output=true;
配置压缩格式(参考HIVE配置)
SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
加载自定义UDF
ADD JAR ${udf_path};
配置自定义UDF函数(参考自定义UDF库)
CREATE TEMPORARY FUNCTION my_dev AS 'udf.tag.ua.DeviceTypeUDF';
hive内置的udf参见:<https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF>
基本操作命令:
进入hive环境
hive;
创建数据仓库
create database e_pingan WITH DBPROPERTIES ('creator' = 'yangbl', 'date' = '20150430');
查看数据仓库
SHOW DATABASES;
进入数据库
use e_pingan;
查看数据表
show tables;
生成可用数据表
add jar /home/xuji/yang/hive/udf.jar;
create temporary function my_decode as 'udf.FullDecodeUDF';
create temporary function my_dev as 'udf.DeviceUDF';
create table zhangwj.touna_used
row format delimited fields terminated by '\t'
location '/user/xuji/yang/test/touna'
as
select srcip,datelabel,loadstamp,my_decode(url) as url,my_decode(cast(unbase64(ref) as string)) as ref,
case
when cast(unbase64(ref) as string) like '%.baidu.%/%wd=%' then my_decode(parse_url(cast(unbase64(ref) as string), 'QUERY', 'wd'))
when cast(unbase64(ref) as string) like '%.baidu.%/%word=%' then my_decode(parse_url(cast(unbase64(ref) as string), 'QUERY', 'word'))
when cast(unbase64(ref) as string) like '%.baidu.%/%kw=%' then my_decode(parse_url(cast(unbase64(ref) as string), 'QUERY', 'kw'))
when cast(unbase64(ref) as string) like '%.haosou.%/%q=%' then my_decode(parse_url(cast(unbase64(ref) as string), 'QUERY', 'q'))
when cast(unbase64(ref) as string) like '%.sogou.%/%query=%' then my_decode(parse_url(cast(unbase64(ref) as string), 'QUERY', 'query'))
when cast(unbase64(ref) as string) like '%.bing.%/%q=%' then my_decode(parse_url(cast(unbase64(ref) as string), 'QUERY', 'q'))
when cast(unbase64(ref) as string) like '%.google.%/%q=%' then my_decode(parse_url(cast(unbase64(ref) as string), 'QUERY', 'q'))
else null end as kw,
my_dev(cast(unbase64(ua) as string)) as ua
from touna;
查询结果保存到本地
insert overwrite local directory '/home/xuji/zhangwj/task' row format delimited fields terminated by '\t'
select search_keyword,sum(pv) as kpv from yangbl.pingan_data_kw_20150302 group by search_keyword order by kpv desc limit 20000;
hive脚本的使用
参见具体的脚本
shell脚本的使用
参见具体脚本
mr程序的编写和运行
参见具体程序