Hive DML
DML 全称 Data Manipulation Language,即数据操作语言。包括 load、insert、update、delete、merge、import、export、explain plan 等。
# 加载文件到表
语法:
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)]
LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)] [INPUTFORMAT 'inputformat' SERDE 'serde'] (3.0 or later)
例:
load data local inpath '/home/bigdata/student.txt' into table student;
# 查询结果插入表
-- 标准写法
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1 FROM from_statement;
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement;
-- 多次插入
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;
FROM from_statement
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2]
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2] ...;
-- 动态分区插入
INSERT OVERWRITE TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;
INSERT INTO TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;
例:
-- 标准写法
insert overwrite table student2 select id, name from student;
-- 多次插入
from student
insert overwrite table student partition(month='202201')
select id, name where month='201709'
insert overwrite table student partition(month='202202')
select id, name where month='201709';
-- 动态分区
insert into student2 partition(month) select * from student;
# 查询结果写入文件
语法:
-- 标准写法
INSERT OVERWRITE [LOCAL] DIRECTORY directory1
[ROW FORMAT row_format] [STORED AS file_format] (Note: Only available starting with Hive 0.11.0)
SELECT ... FROM ...
-- 多次插入
FROM from_statement
INSERT OVERWRITE [LOCAL] DIRECTORY directory1 select_statement1
[INSERT OVERWRITE [LOCAL] DIRECTORY directory2 select_statement2] ...
row_format
: DELIMITED [FIELDS TERMINATED BY char [ESCAPED BY char]] [COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char] [LINES TERMINATED BY char]
[NULL DEFINED AS char] (Note: Only available starting with Hive 0.13)
例:
insert overwrite local directory '/home/bigdata/export/student' select * from student;
# SQL 插入数据
语法:
INSERT INTO TABLE tablename [PARTITION (partcol1[=val1], partcol2[=val2] ...)] VALUES values_row [, values_row ...]
Where values_row is:
( value [, value ...] )
where a value is either null or any valid SQL literal
例:
create table students (name varchar(64), age int, gpa decimal(3, 2))
clustered by (age) into 2 buckets stored as orc;
insert into table students values ('fred flintstone', 35, 1.28), ('barney rubble', 32, 2.32);
create table pageviews (userid varchar(64), link string, came_from string)
partitioned by (datestamp STRING) clustered by (userid) into 256 buckets stored as orc;
insert into table pageviews partition (datestamp = '2014-09-23')
values ('jsmith', 'mail.com', 'sports.com'), ('jdoe', 'mail.com', null);
insert into table pageviews partition (datestamp)
values ('tjohnson', 'sports.com', 'finance.com', '2014-09-23'), ('tlee', 'finance.com', null, '2014-09-21');
insert into table pageviews
values ('tjohnson', 'sports.com', 'finance.com', '2014-09-23'), ('tlee', 'finance.com', null, '2014-09-21');
# 更新
提示
更新只能在支持 ACID 的表上执行,参阅 Hive事务。
语法:
UPDATE tablename SET column = value [, column = value ...] [WHERE expression]
# 删除
提示
删除只能在支持 ACID 的表上执行,参阅 Hive事务。
语法:
DELETE FROM tablename [WHERE expression]
# Merge
提示
Merge 是从 Hive 2.2 版本开始引入的,且只能在支持 ACID 的表上执行,参阅 Hive事务。
Merge 语法使用源表数据批量更新目标表的数据,换句话说就是在一条语句里实现更新、删除和插入操作。使用该功能还需做如下配置:
set hive.support.concurrency = true;
set hive.enforce.bucketing = true;
set hive.exec.dynamic.partition.mode = nonstrict;
set hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
set hive.compactor.initiator.on = true;
set hive.compactor.worker.threads = 1;
set hive.auto.convert.join=false;
set hive.merge.cardinality.check=false;
语法:
MERGE INTO <target table> AS T USING <source expression/table> AS S
ON <boolean expression1>
WHEN MATCHED [AND <boolean expression2>] THEN UPDATE SET <set clause list>
WHEN MATCHED [AND <boolean expression3>] THEN DELETE
WHEN NOT MATCHED [AND <boolean expression4>] THEN INSERT VALUES<value list>
注意
虽然 Hive 支持了 update、delete 甚至 merge 操作,但是使用起来还是有很多约束,比较繁琐。Hive 的定位就是数据仓库工具,不是传统的 RDBMS,所以查询分析操作才是我们应该关注的,实际上在企业中 update、delete、merge 操作并不常用。
# Export/Import
export 命令将表或分区的数据连同元数据一起导出到指定的输出位置。然后可以将此输出位置移动到不同的 Hadoop 或 Hive 实例,并使用 import 命令再次导入。
Export 语法:
EXPORT TABLE tablename [PARTITION (part_column="value"[, ...])]
TO 'export_target_path' [ FOR replication('eventid') ]
Import 语法:
IMPORT [[EXTERNAL] TABLE new_or_original_tablename [PARTITION (part_column="value"[, ...])]]
FROM 'source_path'
[LOCATION 'import_target_path']
例:
简单导入导出:
export table department to 'hdfs_exports_location/department';
import from 'hdfs_exports_location/department';
import 时重命名表:
export table department to 'hdfs_exports_location/department';
import table imported_dept from 'hdfs_exports_location/department';
导出指定分区并导入:
export table employee partition (emp_country="in", emp_state="ka") to 'hdfs_exports_location/employee';
import from 'hdfs_exports_location/employee';
导出表和导入分区:
export table employee to 'hdfs_exports_location/employee';
import table employee partition (emp_country="us", emp_state="tn") from 'hdfs_exports_location/employee';
指定导入位置:
export table department to 'hdfs_exports_location/department';
import table department from 'hdfs_exports_location/department'
location 'import_target_location/department';
作为外部表导入:
export table department to 'hdfs_exports_location/department';
import external table department from 'hdfs_exports_location/department';
上次更新: 2023/11/01, 03:11:44