-- Create the working database and a delimited text table for item data.
CREATE DATABASE IF NOT EXISTS veer;
USE veer;

-- Item master data, loaded from a comma-delimited text file.
CREATE TABLE item_details (
    item_id    INT,
    item_name  VARCHAR(30),
    item_price INT,
    item_color VARCHAR(10)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- BUG FIX: the line below is a shell command, not HiveQL -- it cannot run inside
-- a Hive script. Run it from the OS shell (or use LOAD DATA ... INPATH instead).
-- NOTE(review): the target dir 'veer.db/item' does not match the table name
-- 'item_details', so the table may not see this file -- verify.
-- hadoop fs -put /home/cloudera/sk/data/item.txt /user/hive/warehouse/veer.db/item

-- Basic sanity checks on the loaded data.
SELECT COUNT(*) FROM item_details;
SELECT * FROM item_details ORDER BY item_id DESC;
SELECT item_color, COUNT(*) FROM item_details GROUP BY item_color;
---------------------------31-01-2018----------------------------------
-- Non-partitioned flight staging table (comma-delimited text).
CREATE TABLE flight_nt (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20),
    state  VARCHAR(40)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- Partitioned variant: same data columns, with state as the partition key.
CREATE TABLE flightdata_part (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20)
)
PARTITIONED BY (state VARCHAR(64))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- Load the raw file into the staging table.
-- NOTE(review): target is interns_training.flight_nt, not the flight_nt created
-- above in veer -- presumably notes from a different environment; confirm which
-- database this was meant to run in.
LOAD DATA LOCAL INPATH '/local/home/srv-ae-si01/RTCTM/arun_dev/interns_training/data/FlightBook.txt' INTO TABLE interns_training.flight_nt;
--- PARTITIONED--
-- Multi-level (year/month/day) partitioned flight table.
-- BUG FIX: the original ended the CREATE with ';' after STORED AS TEXTFILE and
-- then had "LOCATION '...';" on its own -- a dangling, invalid statement.
-- LOCATION must be part of the CREATE TABLE, and it must name a directory,
-- not a single file.
CREATE TABLE IF NOT EXISTS flightdata_partition (
    id   INT,
    name VARCHAR(20),
    code VARCHAR(10)
)
PARTITIONED BY (
    year  INT,
    month INT,
    day   INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/home/cloudera/sk/data/';  -- directory containing FlightBook.txt, not the file itself
-- Session settings for dynamic partitioning, bucketing, and output compression.
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;     -- allow all partition columns to be dynamic
SET hive.exec.max.dynamic.partitions = 1000;
SET hive.exec.max.dynamic.partitions.pernode = 1000;
SET hive.exec.compress.output = true;
-- BUG FIX: original read "set.hive.enforce.bucketing=true;" -- "set." with a dot
-- is a syntax error; the keyword is SET followed by the property name.
SET hive.enforce.bucketing = true;
SET mapred.output.compression.codec = org.apache.hadoop.io.compress.SnappyCodec;
SET mapred.output.compression.type = BLOCK;
-- Dynamic partition insert: the partition value is taken from the last column
-- of the SELECT (state).
-- BUG FIX: the original targeted "flight_partition", which is never created in
-- this script; the partitioned table created above is flightdata_part.
INSERT INTO flightdata_part PARTITION (state)
SELECT * FROM interns_training.flight_nt;

-- Static partition insert: the partition value is fixed, so state is not selected.
INSERT INTO flightdata_part PARTITION (state = 'Australia')
SELECT s_id, s_name, code FROM interns_training.flight_nt WHERE state = 'Australia';

-- Pre-create an empty partition (BUG FIX: original statement was missing its ';').
ALTER TABLE flightdata_part ADD PARTITION (state = 'United_States');
---- Bucketing
-- Bucketed (non-partitioned) flight table: rows hashed on s_id into 10 buckets.
CREATE TABLE flight_bucket (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20),
    state  VARCHAR(30)
)
CLUSTERED BY (s_id) INTO 10 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- Must be enabled before inserting so Hive writes one file per bucket.
SET hive.enforce.bucketing = true;
INSERT INTO flight_bucket SELECT * FROM interns_training.flight_nt;
-- Same flight data stored in columnar ORC format.
CREATE TABLE flight_orc (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20),
    state  STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','   -- field delimiter is irrelevant for ORC storage; kept as written
STORED AS ORC;

INSERT INTO flight_orc SELECT * FROM interns_training.flight_nt;
------------------------------PARTITIONED_BUCKET_ORC---------------------------
-- Partitioned + bucketed ORC table: partitioned on state, bucketed on s_id.
CREATE TABLE flight_parti_bucket_orc (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20)
)
PARTITIONED BY (state VARCHAR(30))
CLUSTERED BY (s_id) INTO 10 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS ORC;

-- BUG FIX: the original read "insert into flight_parti_bucket_orc select * from
-- flightdata;" -- "flightdata" is not defined anywhere in this script, and an
-- insert into a partitioned table needs a PARTITION clause for dynamic
-- partitioning. Source presumed to be flight_nt -- verify against the warehouse.
INSERT INTO flight_parti_bucket_orc PARTITION (state)
SELECT * FROM flight_nt;
-----------------------------------snappy--------------------
-- ORC table with explicit Snappy compression.
-- BUG FIX 1: flight_parti_bucket_orc was already created above; re-creating it
-- would fail, so drop it first (tutorial script -- data loss acceptable here).
-- BUG FIX 2: the original TBLPROPERTIES used smart quotes (“orc.compress" = “SNAPPY”),
-- which Hive rejects; string literals must use plain single quotes.
DROP TABLE IF EXISTS flight_parti_bucket_orc;
CREATE TABLE flight_parti_bucket_orc (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20)
)
PARTITIONED BY (state VARCHAR(30))
CLUSTERED BY (s_id) INTO 10 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS ORC
TBLPROPERTIES ('orc.compress' = 'SNAPPY');
------------------------------------------------------------------------------
-- Partitioned + bucketed text table.
CREATE TABLE flight_partition_bucket (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20)
)
PARTITIONED BY (state VARCHAR(64))
CLUSTERED BY (s_id) INTO 10 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- BUG FIX: the original inserted into "flight_partition", which this script
-- never creates; the table just created is flight_partition_bucket.
INSERT INTO flight_partition_bucket PARTITION (state)
SELECT * FROM veer.flight_nt LIMIT 1000;

-- Join the text and ORC copies on state (cross-format join demo).
SELECT A.state, B.state
FROM flight_nt A
JOIN flight_orc B ON A.state = B.state
LIMIT 100;
-----------------------------------------------------------------------------------------
-- Bucket map-join session settings.
SET hive.enforce.sortmergebucketmapjoin = false;
SET hive.auto.convert.sortmerge.join = true;
SET hive.optimize.bucketmapjoin = true;
SET hive.optimize.bucketmapjoin.sortedmerge = true;
-- BUG FIX: the original used a C-style "//" trailing comment, which is not
-- valid HiveQL comment syntax; HiveQL uses "--".
SET hive.auto.convert.join = false;  -- if not disabled, a map-side join happens automatically
-------------------------------------------------1/2/2018--------------------------------------
-- Re-creation of the partitioned + bucketed ORC table (next day's session).
-- BUG FIX: this table was already created twice above, so a bare CREATE would
-- fail with "table already exists"; drop it first to make the script runnable.
DROP TABLE IF EXISTS flight_parti_bucket_orc;
CREATE TABLE flight_parti_bucket_orc (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20)
)
PARTITIONED BY (state VARCHAR(30))
CLUSTERED BY (s_id) INTO 10 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS ORC;
-- LEFT SEMI JOIN: returns flight_nt rows that have at least one match in
-- flight_join; only left-side columns may appear in the SELECT list.
-- NOTE(review): flight_join is not created anywhere in this script -- verify
-- it exists in the target warehouse.
SELECT f.s_id
FROM flight_nt f
LEFT SEMI JOIN flight_join j ON f.s_id = j.id;

-- Plain inner join over the same key (map-side join timing demo).
SELECT f.s_id, j.id
FROM flight_nt f
JOIN flight_join j ON f.s_id = j.id;
-- Observed runtime without map-join enabled: 74.647 seconds.
-- Enable automatic conversion of common joins into map-side joins.
set hive.auto.convert.join=true;
-- Merge multiple map-joins into a single map-only task where possible.
set hive.auto.convert.join.noconditionaltask=true;
------ error: the query below failed with the pasted diagnostics -------------
select f.s_id,j.id from flight_nt f join flight_join j on (f.s_id=j.id);
-- Pasted console output from the failed run (kept as comments so the script stays valid HiveQL):
-- Query ID = cloudera_20180201031919_307c2d2e-eb9f-4789-b777-91f1506b11d5
-- Total jobs = 1
-- FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask
-----------Sort-Merge-Bucket (SMB) Map Join------------
-- Both join inputs must be bucketed AND sorted on the join key for SMB joins.
set hive.enforce.bucketing=true;
set hive.enforce.sorting=true;
---------------skewed join------------
-- Handle heavily-skewed join keys with a follow-up map-join job.
set hive.optimize.skewjoin = true;
-- A key is treated as skewed once it exceeds this many rows.
set hive.skewjoin.key=500000;
-- Upper bound on map tasks for the skew-handling map-join.
set hive.skewjoin.mapjoin.map.tasks=10000;
-- Minimum split size in bytes (32 MB) for the skew-handling job.
set hive.skewjoin.mapjoin.min.split=33554432;
-- NOTE(review): everything from here down duplicates the section above --
-- this looks like an accidental double paste of the same notes. Kept, but
-- made re-runnable.
USE veer;

-- IF NOT EXISTS added: item_details is already created earlier in this script.
CREATE TABLE IF NOT EXISTS item_details (
    item_id    INT,
    item_name  VARCHAR(30),
    item_price INT,
    item_color VARCHAR(10)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- BUG FIX: shell command, not HiveQL -- run from the OS shell; commented out
-- so the script stays valid.
-- hadoop fs -put /home/cloudera/sk/data/item.txt /user/hive/warehouse/veer.db/item

SELECT COUNT(*) FROM item_details;
SELECT * FROM item_details ORDER BY item_id DESC;
SELECT item_color, COUNT(*) FROM item_details GROUP BY item_color;
---------------------------31-01-2018----------------------------------
-- Duplicate of the earlier flight table definitions.
-- BUG FIX: both tables already exist from the first half of this script, so a
-- bare CREATE would fail; IF NOT EXISTS makes the re-run harmless.
CREATE TABLE IF NOT EXISTS flight_nt (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20),
    state  VARCHAR(40)
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

CREATE TABLE IF NOT EXISTS flightdata_part (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20)
)
PARTITIONED BY (state VARCHAR(64))
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- NOTE(review): target is interns_training.flight_nt, not the veer table --
-- presumably notes from a different environment; verify.
LOAD DATA LOCAL INPATH '/local/home/srv-ae-si01/RTCTM/arun_dev/interns_training/data/FlightBook.txt' INTO TABLE interns_training.flight_nt;
--- PARTITIONED--
-- Multi-level (year/month/day) partitioned flight table.
-- BUG FIX: the original ended the CREATE with ';' after STORED AS TEXTFILE and
-- left "LOCATION '...';" as a dangling, invalid statement. LOCATION must be
-- part of the CREATE TABLE and must name a directory, not a single file.
CREATE TABLE IF NOT EXISTS flightdata_partition (
    id   INT,
    name VARCHAR(20),
    code VARCHAR(10)
)
PARTITIONED BY (
    year  INT,
    month INT,
    day   INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/home/cloudera/sk/data/';  -- directory containing FlightBook.txt, not the file itself
-- Session settings for dynamic partitioning, bucketing, and output compression.
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;     -- allow all partition columns to be dynamic
SET hive.exec.max.dynamic.partitions = 1000;
SET hive.exec.max.dynamic.partitions.pernode = 1000;
SET hive.exec.compress.output = true;
-- BUG FIX: original read "set.hive.enforce.bucketing=true;" -- "set." with a dot
-- is a syntax error.
SET hive.enforce.bucketing = true;
SET mapred.output.compression.codec = org.apache.hadoop.io.compress.SnappyCodec;
SET mapred.output.compression.type = BLOCK;
-- Dynamic partition insert: the partition value comes from the last SELECT column.
-- BUG FIX: the original targeted "flight_partition", which is never created in
-- this script; the partitioned table defined above is flightdata_part.
INSERT INTO flightdata_part PARTITION (state)
SELECT * FROM interns_training.flight_nt;

-- Static partition insert: partition value fixed, so state is not selected.
INSERT INTO flightdata_part PARTITION (state = 'Australia')
SELECT s_id, s_name, code FROM interns_training.flight_nt WHERE state = 'Australia';

-- Pre-create an empty partition (BUG FIX: original statement was missing its ';').
ALTER TABLE flightdata_part ADD PARTITION (state = 'United_States');
---- Bucketing
-- Bucketed flight table (duplicate of the earlier definition).
-- BUG FIX: flight_bucket already exists from the first half of this script;
-- IF NOT EXISTS prevents a "table already exists" failure on re-run.
CREATE TABLE IF NOT EXISTS flight_bucket (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20),
    state  VARCHAR(30)
)
CLUSTERED BY (s_id) INTO 10 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- Must be enabled before inserting so Hive writes one file per bucket.
SET hive.enforce.bucketing = true;
INSERT INTO flight_bucket SELECT * FROM interns_training.flight_nt;
---- ORC -------------------------
-- ORC copy of the flight data (duplicate of the earlier definition).
-- BUG FIX: flight_orc already exists from the first half of this script;
-- IF NOT EXISTS prevents a "table already exists" failure on re-run.
CREATE TABLE IF NOT EXISTS flight_orc (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20),
    state  STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','   -- field delimiter is irrelevant for ORC storage
STORED AS ORC;

INSERT INTO flight_orc SELECT * FROM interns_training.flight_nt;
------------------------------PARTITIONED_BUCKET_ORC---------------------------
-- Partitioned + bucketed ORC table (duplicate section).
-- BUG FIX 1: the table already exists from the first half of this script, so
-- drop it before re-creating (tutorial script -- data loss acceptable here).
DROP TABLE IF EXISTS flight_parti_bucket_orc;
CREATE TABLE flight_parti_bucket_orc (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20)
)
PARTITIONED BY (state VARCHAR(30))
CLUSTERED BY (s_id) INTO 10 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS ORC;

-- BUG FIX 2: the original inserted "select * from flightdata" with no PARTITION
-- clause; "flightdata" is not defined anywhere in this script and a dynamic
-- partition insert needs PARTITION (state). Source presumed to be flight_nt --
-- verify against the actual warehouse.
INSERT INTO flight_parti_bucket_orc PARTITION (state)
SELECT * FROM flight_nt;
-----------------------------------snappy--------------------
-- ORC table with explicit Snappy compression (duplicate section).
-- BUG FIX 1: flight_parti_bucket_orc already exists; drop before re-creating.
-- BUG FIX 2: the original TBLPROPERTIES used smart quotes (“orc.compress" = “SNAPPY”),
-- which Hive rejects; string literals must use plain single quotes.
DROP TABLE IF EXISTS flight_parti_bucket_orc;
CREATE TABLE flight_parti_bucket_orc (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20)
)
PARTITIONED BY (state VARCHAR(30))
CLUSTERED BY (s_id) INTO 10 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS ORC
TBLPROPERTIES ('orc.compress' = 'SNAPPY');
------------------------------------------------------------------------------
-- Partitioned + bucketed text table (duplicate section).
-- BUG FIX: the table already exists from the first half; IF NOT EXISTS makes
-- the re-run harmless.
CREATE TABLE IF NOT EXISTS flight_partition_bucket (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20)
)
PARTITIONED BY (state VARCHAR(64))
CLUSTERED BY (s_id) INTO 10 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

-- BUG FIX: the original inserted into "flight_partition", which this script
-- never creates; the table defined just above is flight_partition_bucket.
INSERT INTO flight_partition_bucket PARTITION (state)
SELECT * FROM veer.flight_nt LIMIT 1000;

-- Join the text and ORC copies on state (cross-format join demo).
SELECT A.state, B.state
FROM flight_nt A
JOIN flight_orc B ON A.state = B.state
LIMIT 100;
-----------------------------------------------------------------------------------------
-- Bucket map-join session settings.
SET hive.enforce.sortmergebucketmapjoin = false;
SET hive.auto.convert.sortmerge.join = true;
SET hive.optimize.bucketmapjoin = true;
SET hive.optimize.bucketmapjoin.sortedmerge = true;
-- BUG FIX: the original used a C-style "//" trailing comment, which is not
-- valid HiveQL comment syntax; HiveQL uses "--".
SET hive.auto.convert.join = false;  -- if not disabled, a map-side join happens automatically
-------------------------------------------------1/2/2018--------------------------------------
-- Re-creation of the partitioned + bucketed ORC table (next day's session).
-- BUG FIX: the table already exists from earlier sections, so a bare CREATE
-- would fail; drop it first to make the script runnable.
DROP TABLE IF EXISTS flight_parti_bucket_orc;
CREATE TABLE flight_parti_bucket_orc (
    s_id   INT,
    s_name VARCHAR(30),
    code   VARCHAR(20)
)
PARTITIONED BY (state VARCHAR(30))
CLUSTERED BY (s_id) INTO 10 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
STORED AS ORC;
-- LEFT SEMI JOIN: returns flight_nt rows that have at least one match in
-- flight_join; only left-side columns may appear in the SELECT list.
-- NOTE(review): flight_join is not created anywhere in this script -- verify
-- it exists in the target warehouse.
SELECT f.s_id
FROM flight_nt f
LEFT SEMI JOIN flight_join j ON f.s_id = j.id;

-- Plain inner join over the same key (map-side join timing demo).
SELECT f.s_id, j.id
FROM flight_nt f
JOIN flight_join j ON f.s_id = j.id;
-- Observed runtime without map-join enabled: 74.647 seconds.
-- Enable automatic conversion of common joins into map-side joins.
set hive.auto.convert.join=true;
-- Merge multiple map-joins into a single map-only task where possible.
set hive.auto.convert.join.noconditionaltask=true;
------ error: the query below failed with the pasted diagnostics -------------
select f.s_id,j.id from flight_nt f join flight_join j on (f.s_id=j.id);
-- Pasted console output from the failed run (kept as comments so the script stays valid HiveQL):
-- Query ID = cloudera_20180201031919_307c2d2e-eb9f-4789-b777-91f1506b11d5
-- Total jobs = 1
-- FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask
-----------Sort-Merge-Bucket (SMB) Map Join------------
-- Both join inputs must be bucketed AND sorted on the join key for SMB joins.
set hive.enforce.bucketing=true;
set hive.enforce.sorting=true;
---------------skewed join------------
-- Handle heavily-skewed join keys with a follow-up map-join job.
set hive.optimize.skewjoin = true;
-- A key is treated as skewed once it exceeds this many rows.
set hive.skewjoin.key=500000;
-- Upper bound on map tasks for the skew-handling map-join.
set hive.skewjoin.mapjoin.map.tasks=10000;
-- Minimum split size in bytes (32 MB) for the skew-handling job.
set hive.skewjoin.mapjoin.min.split=33554432;
-- (Blog page footer residue from the original paste, kept as comments:)
-- Comments
-- Post a Comment