Hadoop Configuration and Applications (Part 2)

Practice 6 – Basic HBase Shell Commands

HBase Setup and Configuration

  1. Download and extract the HBase tarball

  2. Set up and configure ZooKeeper first

  3. Configure system environment variables

    vi /etc/profile
    # add:
    export HBASE_HOME=/path/of/your/hbase-1.2.0-cdh5.7.0
    export PATH=$HBASE_HOME/bin:$PATH
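
    Reload the profile so the new variables take effect in the current shell:

    source /etc/profile
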
  4. Configure HBase

    cd $HBASE_HOME/conf
    vi hbase-site.xml
    # add:
    <property>
      <name>hbase.rootdir</name>
      <value>hdfs://bigdata:8020/hbase</value>
    </property>
    <property>
      <name>hbase.cluster.distributed</name>
      <value>true</value>
    </property>
    <property>
      <name>hbase.zookeeper.quorum</name>
      <value>bigdata:2181</value>
    </property>
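
    Note: hbase.rootdir must use the same NameNode host/port as fs.defaultFS in Hadoop's core-site.xml (here bigdata:8020), and hbase.zookeeper.quorum must list the ZooKeeper host(s) set up in step 2.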
  5. Edit the env configuration file

    vi hbase-env.sh
    # change:
    export JAVA_HOME=/usr/local/java/jdk1.8
    # false = use the external ZooKeeper from step 2, not HBase's bundled one
    export HBASE_MANAGES_ZK=false
  6. Start HBase:

    # start ZooKeeper first
    $ZK_HOME/bin/zkServer.sh start

    # start HBase
    $HBASE_HOME/bin/start-hbase.sh
    # the HBase web UI is served on port 60010

    # access HBase through the HBase shell
    $HBASE_HOME/bin/hbase shell
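
    Once inside the shell, a quick sanity check confirms the cluster is up (both are standard HBase shell commands):

    status   # number of live/dead region servers
    version  # HBase version string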

DDL

  7. Create a table

    # create a table named member with column families member_id, address, and info
    create 'member','member_id','address','info'

    # delete a column family
    alter 'member',{NAME=>'member_id',METHOD=>'delete'}
    describe 'member'
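
    Adding a column family back uses the same alter command; for example, with a hypothetical family named extra_info:

    alter 'member', NAME => 'extra_info'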
  8. Describe a table

    describe 'member'
  9. List all tables
    list

  10. Drop a table

    # a table must be disabled before it can be dropped
    disable 'member'
    drop 'member'

DML

  11. Insert records

    put 'member','alex','info:age','24'
    put 'member','alex','info:birthday','1987-06-17'
    put 'member','alex','info:company','alibaba'
    put 'member','alex','address:country','china'
    put 'member','alex','address:province','zhejiang'
    put 'member','alex','address:city','hangzhou'
    put 'member','eric','info:birthday','1987-4-17'
    put 'member','eric','info:favorite','movie'
    put 'member','eric','info:company','alibaba'
    put 'member','eric','address:country','china'
    put 'member','eric','address:province','guangdong'
    put 'member','eric','address:city','guangzhou'
    put 'member','eric','address:town','biaoyun'
  12. Scan all records
    scan 'member'
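
    scan also accepts options to narrow the output, e.g. one column family over a row-key range (standard HBase shell syntax; STOPROW is exclusive):

    scan 'member', {COLUMNS => 'info', STARTROW => 'alex', STOPROW => 'eric'}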

  13. Get records

    # get one row
    # all data under a row key
    get 'member','alex'

    # all data in one column family under a row key
    get 'member','alex','info'

    # the value of one column as of a specific timestamp (replace `timestamp` with a real value)
    get 'member','alex',{COLUMN=>'info:age',TIMESTAMP=>timestamp}
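
    To pull several versions of a cell at once, combine COLUMN with VERSIONS; the column family must be configured to keep more than one version for this to return more than one result:

    get 'member','alex',{COLUMN=>'info:age',VERSIONS=>2}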
  14. Update a record

    # an update is just another put on the same cell; it writes a new version
    put 'member','alex','info:age','99'
  15. Delete records

    # delete the 'info:age' column for row key alex
    delete 'member','alex','info:age'

    # delete an entire row
    deleteall 'member','eric'

Other common commands

  16. Count rows
    count 'member'

  17. Truncate a table
    truncate 'member'
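
    Note: truncate works by disabling, dropping, and recreating the table with the same schema.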

Practice 7 – Basic Hive Shell Commands

Hive Setup and Configuration

  18. Download and extract the Hive tarball

  19. Install a MySQL database for the metastore

  20. Configure system environment variables and hive

    vi /etc/profile
    # add:
    export HIVE_HOME=/path/of/your/hive-1.1.0-cdh5.7.0
    export PATH=$HIVE_HOME/bin:$PATH

    # configure hive
    cd $HIVE_HOME/conf
    vi hive-site.xml
    # add:
    <property>
      <name>javax.jdo.option.ConnectionURL</name>
      <value>jdbc:mysql://localhost:3306/sparksql?createDatabaseIfNotExist=true</value>
    </property>

    <property>
      <name>javax.jdo.option.ConnectionDriverName</name>
      <value>com.mysql.jdbc.Driver</value>
    </property>

    <property>
      <name>javax.jdo.option.ConnectionUserName</name>
      <value>root</value>
    </property>

    <property>
      <name>javax.jdo.option.ConnectionPassword</name>
      <value>root</value>
    </property>
  21. Copy the MySQL JDBC driver jar into $HIVE_HOME/lib/
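
    Depending on the Hive version, the metastore schema may also need to be initialized in MySQL before the first start; Hive ships a schematool for this:

    $HIVE_HOME/bin/schematool -dbType mysql -initSchema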

  22. Start hive: $HIVE_HOME/bin/hive

Word count with Hive

  23. Create an internal (managed) table

    -- each input line becomes one row in the single column `line`
    create table wc(
    line string)
    row format delimited fields terminated by '\n';
  24. Load data

    -- from the local filesystem
    load data local inpath '/root/data/wordcount.txt' overwrite into table wc;

    Alternatively, upload wordcount.txt to HDFS first and load it from there.

    hadoop fs -put /root/data/wordcount.txt /wordcount.txt

    Then run in hive:

    -- from HDFS (note: this moves the file into the table's warehouse directory)
    load data inpath '/wordcount.txt' overwrite into table wc;
  25. Run the word count

    -- list the built-in functions
    show functions;

    -- word frequency, top 10
    select word, count(1) as word_count
    from
    (select explode(split(line1, " ")) as word
    from
    (select regexp_replace(line, '[^a-zA-Z0-9]', ' ') as line1
    from wc
    ) wc1
    ) wc2
    where word regexp '[a-zA-Z0-9]'
    group by word
    order by word_count desc
    limit 10;
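
    To see what the inner explode(split(...)) step produces on its own, run it against a string literal; this returns three rows: hello, world, hello.

    select explode(split('hello world hello', ' '));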

Basic Hive usage

Sample data:
weblog_entries.txt

e7d3f242f111c1b522137481d8508ab7 /ckyhatbpxu.html 2012-05-10 21:11:20 4.175.198.160
b8bd62a5c4ede37b9e77893e043fc1 /rr.html 2012-05-10 21:32:08 24.146.153.181

ip_to_country.txt

24.146.153.181 United States
203.156.137.68 Thailand

  1. Create the tables
    drop table if exists weblog_entries;

    -- an external table: dropping the table does not delete the underlying data
    create external table weblog_entries(
    sid string,
    url string,
    req_date string,
    req_time string,
    ip string)
    row format delimited fields terminated by '\t'
    lines terminated by '\n';

    create table ip_to_country(
    ip string,
    country string)
    row format delimited fields terminated by '\t'
    lines terminated by '\n';
  2. Load data
    load data local inpath '/root/data/weblog_entries.txt' into table weblog_entries;
    load data local inpath '/root/data/ip_to_country.txt' into table ip_to_country;
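
    A quick check that the load worked:

    select * from weblog_entries limit 3;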
  3. Count which countries the site's visitors mainly come from
    select itc.country, count(1) as c_count
    from weblog_entries wle join ip_to_country itc on wle.ip = itc.ip
    group by itc.country
    order by c_count desc;
  4. Find the most popular pages
    select url, count(1) as url_count
    from weblog_entries
    group by url
    order by url_count desc
    limit 5;
  5. Count the traffic between 21:10 and 21:20
    select count(*) as total
    from weblog_entries
    where hour(req_time) = 21
    and minute(req_time) >=10 and minute(req_time) <=20;
  6. Data-cleaning examples
    -- concatenate date and time
    select concat_ws('_', req_date, req_time)
    from weblog_entries;

    -- keep only rows whose id is at least 32 characters long
    create view weblog_sid32_view as
    select *
    from weblog_entries
    where length(sid) >= 32;

    select * from weblog_sid32_view;
