实践6 – HBase Shell基本命令
HBase搭建和配置
下载并解压
需先配置好zookeeper
配置系统环境变量
vi /etc/profile
增加
export HBASE_HOME=/path/of/your/hbase-1.2.0-cdh5.7.0
export PATH=$HBASE_HOME/bin:$PATH
配置HBase
cd $HBASE_HOME/conf
vi hbase-site.xml
增加
<property>
<name>hbase.rootdir</name>
<value>hdfs://bigdata:8020/hbase</value>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<property>
<name>hbase.zookeeper.quorum</name>
<value>bigdata:2181</value>
</property>
修改env配置文件
vi hbase-env.sh
修改
export JAVA_HOME=/usr/local/java/jdk1.8
export HBASE_MANAGES_ZK=false
启动HBase:
# 先启动zookeeper
$ZK_HOME/bin/zkServer.sh start
# 启动hbase
$HBASE_HOME/bin/start-hbase.sh
# 可通过60010端口访问hbase的webui
# 通过hbase-shell访问hbase
$HBASE_HOME/bin/hbase shell
DDL
创建表
# 创建一个表
create 'member','member_id','address','info'
# 删除一个列族
alter 'member',{NAME=>'member_id',METHOD=>'delete'}
describe 'member'
查询表信息
describe 'member'
列出所有表
list
删除表
disable 'member'
drop 'member'
DML
添加记录
put 'member','alex','info:age','24'
put 'member','alex','info:birthday','1987-06-17'
put 'member','alex','info:company','alibaba'
put 'member','alex','address:country','china'
put 'member','alex','address:province','zhejiang'
put 'member','alex','address:city','hangzhou'
put 'member','eric','info:birthday','1987-4-17'
put 'member','eric','info:favorite','movie'
put 'member','eric','info:company','alibaba'
put 'member','eric','address:country','china'
put 'member','eric','address:province','guangdong'
put 'member','eric','address:city','guangzhou'
put 'member','eric','address:town','biaoyun'
查询所有记录
scan 'member'
查询记录
# 获取一条数据
# 获取row-key下面所有数据
get 'member','alex'
# 获取row-key下面一个列族中一个列的所有数据
get 'member','alex','info'
# 通过timestamp来获取两个版本的数据
get 'member','alex',{COLUMN=>'info:age',TIMESTAMP=>时间戳}
更新记录
put 'member','alex','info:age','99'
删除记录
# 删除row-key为alex的值的'info:age'字段
delete 'member','alex','info:age'
# 删除整行
deleteall 'member','eric'
其他常用命令
统计行数
count 'member'
清空表
truncate 'member'
实践7 – Hive Shell基本命令
hive搭建和配置
下载并解压
安装mysql数据库
配置系统环境变量
vi /etc/profile
增加
export HIVE_HOME=/path/of/your/hive-1.1.0-cdh5.7.0
export PATH=$HIVE_HOME/bin:$PATH
配置hive
cd $HIVE_HOME/conf
vi hive-site.xml
增加
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/sparksql?createDatabaseIfNotExist=true</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>root</value>
</property>
拷贝mysql驱动到$HIVE_HOME/lib/
启动hive: $HIVE_HOME/bin/hive
hive实现word count单词统计
创建一张内部表
create table wc(
line string)
row format delimited fields terminated by '\n';
导入数据
-- 本地数据
load data local inpath '/root/data/wordcount.txt' overwrite into table wc;
也可以先将wordcount.txt上传到hdfs,再导入数据。
hadoop fs -put /root/data/wordcount.txt /wordcount.txt
之后在hive执行
-- hdfs上数据
load data inpath '/wordcount.txt' overwrite into table wc;
统计
-- 查看内部函数
show functions;
-- 统计词频
select word, count(1) as word_count
from
(select explode(split(line1, " ")) as word
from
(select regexp_replace(line, '[^a-zA-Z0-9]', ' ') as line1
from wc
) wc1
) wc2
where word regexp '[a-zA-Z0-9]'
group by word
order by word_count desc
limit 10;
hive基本使用
部分数据样式
weblog_entries.txt
e7d3f242f111c1b522137481d8508ab7 /ckyhatbpxu.html 2012-05-10 21:11:20 4.175.198.160
b8bd62a5c4ede37b9e77893e043fc1 /rr.html 2012-05-10 21:32:08 24.146.153.181
ip_to_country.txt
24.146.153.181 United States
203.156.137.68 Thailand
- 创建表
drop table if exists weblog_entries;
-- 创建外部表,可保障删表时不删数据
create external table weblog_entries(
sid string,
url string,
req_date string,
req_time string,
ip string)
row format delimited fields terminated by '\t'
lines terminated by '\n';
create table ip_to_country(
ip string,
country string)
row format delimited fields terminated by '\t'
lines terminated by '\n';
- 导入数据
load data local inpath '/root/data/weblog_entries.txt' into table weblog_entries;
load data local inpath '/root/data/ip_to_country.txt' into table ip_to_country;
- 统计访问本网站的用户主要来自于哪些国家
select itc.country, count(1) as c_count
from weblog_entries wle join ip_to_country itc on wle.ip = itc.ip
group by itc.country
order by c_count desc;
- 统计最受欢迎的栏目
select url, count(1) as url_count
from weblog_entries
group by url
order by url_count desc
limit 5;
- 统计21点10分到21点20分之间的流量
select count(*) as total
from weblog_entries
where hour(req_time) = 21
and minute(req_time) >= 10 and minute(req_time) <= 20;
- 清洗数据展示
-- 合并时间和日期
select concat_ws('_', req_date, req_time)
from weblog_entries;
-- 把id不足32位的数据去掉
create view weblog_sid32_view as
select *
from weblog_entries
where length(sid) >=32;
select * from weblog_sid32_view;