CHEATSHEET January 20, 2022

hdfs


Write to a file directly from the shell

hdfs dfs -appendToFile - <HDFS_file_path>
# press Ctrl+C to stop writing

Daemon start/stop commands


$HADOOP_HOME/sbin/hadoop-daemon.sh start namenode
$HADOOP_HOME/sbin/hadoop-daemon.sh stop namenode
$HADOOP_HOME/sbin/hadoop-daemon.sh start datanode
$HADOOP_HOME/sbin/hadoop-daemon.sh stop datanode
$HADOOP_HOME/sbin/hadoop-daemon.sh start journalnode
$HADOOP_HOME/sbin/hadoop-daemon.sh stop journalnode
$HADOOP_HOME/sbin/hadoop-daemon.sh start zkfc
$HADOOP_HOME/sbin/hadoop-daemon.sh stop zkfc
$HADOOP_HOME/sbin/yarn-daemon.sh start resourcemanager
$HADOOP_HOME/sbin/yarn-daemon.sh stop resourcemanager
$HADOOP_HOME/sbin/yarn-daemon.sh start nodemanager
$HADOOP_HOME/sbin/yarn-daemon.sh stop nodemanager
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh stop historyserver

Deploying 3.x

# etc/hadoop/hadoop-env.sh
export JAVA_HOME=/opt/soft/jdk
export HADOOP_PID_DIR=${HADOOP_HOME}
<!-- etc/hadoop/core-site.xml -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
<!-- etc/hadoop/hdfs-site.xml -->
<property>
<name>dfs.namenode.name.dir</name>
<value>/opt/soft/data/hdfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/opt/soft/data/hdfs/data</value>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<!-- etc/hadoop/yarn-site.xml -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,PATH,LANG,TZ,HADOOP_MAPRED_HOME</value>
</property>
<!-- etc/hadoop/mapred-site.xml -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
</property>
bin/hdfs namenode -format
sbin/start-dfs.sh

Protecting files from deletion

HDFS delete semantics: to delete something, the user needs write (w) permission on the parent directory of every file or directory node being removed along the path.

In other words, even if user A owns the root directory, once A has created (or allowed creation of) a directory owned by user B and that directory contains data, A cannot delete it unless A is a superuser. Note that if the directory is empty, A can still delete it.
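A minimal sketch of this rule (users and paths here are hypothetical): user A owns /data, while /data/b_dir is owned by user B with mode 755 and contains files.

# as user A (not a superuser):
hdfs dfs -rm -r /data/b_dir        # fails: removing b_dir's children needs write permission on b_dir itself
hdfs dfs -rm -r /data/empty_b_dir  # succeeds: an empty directory only needs write permission on its parent /data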

How this is used

Startup: start the Hadoop services as the hdfs user.

Job deployment: with ds (DolphinScheduler), uploaded jars and submitted jobs run as the hadoop user; in production, data written by hadoop is created with mode 755.

Personal debugging / data inspection: run hadoop/spark/flink commands locally.

Data protection: periodically chown critical data to the hdfs user with mode 755, so only hdfs can delete those files.

hdfs dfs -chown hdfs xxx
#hdfs dfs -chmod -R 755 xxx

Configuring permissions

hdfs-site.xml

<property>
<name>dfs.permissions.enabled</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.acls.enabled</name>
<value>true</value>
</property>

Setting ACLs for individual users
Add an ACL: hdfs dfs -setfacl -m user:jiangmanhua:rwx /hive/warehouse/label.db
Add an ACL recursively: hdfs dfs -setfacl -R -m user:hadoop:r-x /dir
Remove an ACL entry: hdfs dfs -setfacl -x user:hadoop /file
Query ACLs: hdfs dfs -getfacl /hive/warehouse/label.db/
hdfs dfs -ls /hive/warehouse/label.db/

hadoop-policy.xml

yarn rmadmin -refreshServiceAcl
hadoop dfsadmin -refreshServiceAcl

Master switch
hadoop.security.authorization=true

Controls job submission / JobTracker
security.job.client.protocol.acl

Controls HDFS access / NameNode
security.client.protocol.acl
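A sketch of what such an entry looks like in hadoop-policy.xml (user and group names are placeholders; the value format is "comma-separated users<space>comma-separated groups", and * means everyone):

<property>
<name>security.client.protocol.acl</name>
<value>hadoop,hdfs datagroup</value>
</property>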

yarn-site

Enable YARN ACLs:
hadoop: core-site.xml
hadoop.security.authorization=true # enables service-level authorization; without it the ACL settings of Hadoop components do not take effect
yarn: yarn-site.xml

yarn.acl.enable=true
yarn.admin.acl=hdfs

$ vi $HADOOP_CONF_DIR/capacity-scheduler.xml
$ yarn rmadmin -refreshQueues

yarn.scheduler.capacity.root.<queue-path>.acl_submit_applications

Cross-cluster copy

[Test case from new cluster]

hadoop distcp hdfs://ip:9000/hive/spark-jars/metrics-core-4.1.1.jar /
hadoop distcp /spark/lib/HikariCP-2.5.1.jar hdfs://ip:9000/

Incremental copy / skip checksum comparison (useful when the two clusters differ in checksum type or block size): -update -skipcrccheck
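For example, a sketch of an incremental re-copy between such clusters (the address and paths are placeholders):

hadoop distcp -update -skipcrccheck hdfs://ip:9000/hive/warehouse/src.db /hive/warehouse/src.db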

Quota: directory capacity limits

hdfs dfs -count -q -h -v /hive/warehouse/tmp.db
Shows the directory count, file count, and total content size, along with the quota columns.
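With -v a header line is printed; the columns look roughly like:

QUOTA REM_QUOTA SPACE_QUOTA REM_SPACE_QUOTA DIR_COUNT FILE_COUNT CONTENT_SIZE PATHNAME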

Name quota (number of files and directories under the directory)
hdfs dfsadmin -setQuota 1800000 /hive/warehouse/tmp.db
Clear the quota
hdfs dfsadmin -clrQuota /hive/warehouse/tmp.db

Space quota (total file size)
hdfs dfsadmin -setSpaceQuota 12T /hive/warehouse/tmp.db
Clear the quota
hdfs dfsadmin -clrSpaceQuota /hive/warehouse/tmp.db

Using WebHDFS

http://10.17.41.126:50070/explorer.html#/

user.name=hadoop

List

curl -i  "http://10.17.41.126:50070/webhdfs/v1/opendata?op=LISTSTATUS"

Create

curl -i -X PUT "http://10.17.41.126:50070/webhdfs/v1/opendata/testFile?op=CREATE&overwrite=false&replication=1"

The response returns the datanode address:
http://LGJF-ZYC6-HCY-SVR553.cmic:50075/webhdfs/v1/opendata/testFile?op=CREATE&namenoderpcaddress=LGJF-ZYC6-HCY-SVR553:9000&createflag=&createparent=true&overwrite=false&replication=1

curl -i -X PUT -T <LOCAL_FILE> "http://LGJF-ZYC6-HCY-SVR553.cmic:50075/webhdfs/v1/opendata/testFile?&op=CREATE&namenoderpcaddress=LGJF-ZYC6-HCY-SVR553:9000&createflag=&createparent=true&overwrite=false&replication=1"
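Reading a file works the same way; a sketch (op=OPEN redirects to a datanode, which curl -L follows):

curl -i -L "http://10.17.41.126:50070/webhdfs/v1/opendata/testFile?op=OPEN&user.name=hadoop"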

For a scripted approach see: https://github.com/pensz/hdfs_tools/tree/master/shell

On controlling capacity: https://blog.csdn.net/weixin_30338481/article/details/94915052

Rack awareness

Add the following to core-site.xml:

<property> 
<name>net.topology.script.file.name</name>
<value>/home/rack_topology.sh</value>
</property>

rack_topology.sh

#!/bin/bash
# Maps each host/IP argument to a rack using a lookup file
TOPO=/mnt/d/topology.data

while [ $# -gt 0 ] ; do
  nodeArg=$1
  result=`awk -v var=${nodeArg} '$1 == var {print $2}' ${TOPO}`
  shift
  if [ -z "$result" ] ; then
    echo -n "/default/rack "
  else
    echo -n "$result "
  fi
done

topology.data

hadoopdata1.com     /dc1/rack1
hadoopdata1 /dc1/rack1
10.1.1.1 /dc1/rack2
#!/usr/bin/env python
# Python alternative to rack_topology.sh: maps a host/IP to its rack
import sys

rack = {"dn178.tj": "rack1",
        "dn187.tj": "rack2",
        "192.168.1.178": "rack1",
        "192.168.1.187": "rack2",
        }
if __name__ == "__main__":
    print("/" + rack.get(sys.argv[1], "rack0"))

Merging small files

http://hadoop.apache.org/docs/current/hadoop-archives/HadoopArchives.html

Short-circuit reads

https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/ShortCircuitLocalReads.html
https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/NativeLibraries.html
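A minimal hdfs-site.xml sketch for enabling short-circuit reads, following the linked guide (the socket path is just an example location; it must exist on each datanode):

<property>
<name>dfs.client.read.shortcircuit</name>
<value>true</value>
</property>
<property>
<name>dfs.domain.socket.path</name>
<value>/var/lib/hadoop-hdfs/dn_socket</value>
</property>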

External client configuration with Kerberos

Multiple conf versions can coexist side by side:

export HADOOP_CONF_DIR=/data/soft/hadoop-2.10.2-bdoc/etc/hadoop
export HADOOP_CLASSPATH=`hadoop classpath`

check: yarn app -list && hdfs dfs -ls .

core-site.xml
<configuration xmlns:xi="http://www.w3.org/2001/XInclude">
<!-- must match the target cluster's nameservice; do not use a made-up name -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://ns1</value>
</property>
<!-- Kerberos-authenticated environment -->
<property>
<name>hadoop.security.authentication</name>
<value>kerberos</value>
</property>
<!-- allows accessing both kerberos- and simple-auth environments -->
<property>
<name>ipc.client.fallback-to-simple-auth-allowed</name>
<value>true</value>
</property>
</configuration>

hdfs-site.xml

<configuration>

<property>
<name>dfs.nameservices</name>
<value>ns1,ns2</value>
</property>

<property>
<name>dfs.ha.namenodes.ns1</name>
<value>nn1,nn2</value>
</property>

<!-- must use hostnames, otherwise principal validation fails:
Server has invalid Kerberos principal: nn/b27-gz3-sjjcpt-js-009@GZHLWYJQ, expecting: nn/10.136.102.9@GZHLWYJQ -->
<property>
<name>dfs.namenode.rpc-address.ns1.nn1</name>
<value>h1:8020</value>
</property>

<property>
<name>dfs.namenode.rpc-address.ns1.nn2</name>
<value>h2:8020</value>
</property>

<property>
<name>dfs.client.failover.proxy.provider.ns1</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>

<property>
<name>dfs.namenode.kerberos.principal</name>
<value>nn/_HOST@xx</value>
</property>

<!-- the ns2 section repeats the same properties with ns2 values -->

</configuration>

yarn-site.xml

<configuration>
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>

<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>h1-yy</value>
</property>

<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>h2</value>
</property>
<property>
<name>yarn.client.failover-proxy-provider</name>
<value>org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider</value>
</property>

<!-- lets Spark-on-YARN jobs redirect from the 4040 UI to the YARN web UI; optional -->
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>h1:8088</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>h2:8088</value>
</property>
<property>
<name>yarn.http.policy</name>
<value>HTTP_ONLY</value>
</property>


<property>
<name>yarn.resourcemanager.principal</name>
<value>rm/_HOST@xx</value>
</property>
</configuration>

distcp: exclude the remote cluster from delegation-token renewal
-Dmapreduce.job.hdfs-servers.token-renewal.exclude=cluster_2

mapred-site.xml

<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.job.queuename</name>
<value>root.xx</value>
</property>
<property>
<name>mapreduce.cluster.local.dir</name>
<value>/data/soft/hadoop/mrLocalTmp</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*</value>
</property>

<!-- has no effect in local mode, where it is fixed at 1; see org.apache.hadoop.mapred.LocalClientProtocolProvider -->
<property>
<name>mapreduce.local.map.tasks.maximum</name>
<value>10</value>
</property>
<property>
<name>mapreduce.local.reduce.tasks.maximum</name>
<value>10</value>
</property>
</configuration>

hive-site.xml

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://a:3306,b:3306,c:3306/hive?autoReconnect=true&amp;createDatabaseIfNotExist=true&amp;useUnicode=true&amp;characterEncoding=utf-8&amp;failOverReadOnly=false&amp;useSSL=false</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>xxx</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>xxx</value>
</property>

<property>
<name>hive.metastore.sasl.enabled</name>
<value>true</value>
</property>
<property>
<name>hive.metastore.kerberos.principal</name>
<value>hive/x@x</value>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://a:9083,thrift://b:9083,thrift://c:9083</value>
</property>
<property>
<name>hive.server2.authentication.kerberos.principal</name>
<value>hive/gzhlwtz@HLWKDC</value>
</property>

</configuration>
./start-thriftserver.sh \
--deploy-mode client \
--conf spark.driver.host=ip \
--name SparkJDBC \
--driver-memory 2g --executor-memory 4g --executor-cores 5 --num-executors 2 \
--hiveconf hive.server2.thrift.port=10001 \
--hiveconf hive.server2.authentication.kerberos.principal=x@x \
--hiveconf hive.server2.authentication.kerberos.keytab=/home/x.keytab \
--hiveconf hive.server2.enable.doAs=false

Set spark.driver.host to an explicit IP to avoid hostname-resolution problems.
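To verify the server, a connection can be made with Beeline (a sketch; the host and principal are placeholders matching the options above):

beeline -u "jdbc:hive2://ip:10001/default;principal=x@x"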

List file sizes, sorted

Sort by aggregated size
hdfs dfs -dus -h /hive/warehouse/orc_db/* | sed 's/ //' | sort -h

Sort by modification time
hdfs dfs -ls -t /hive/warehouse/orc_db

Changing the replication factor

(When some nodes are unreachable, temporarily raising replication keeps the data accessible.)

hdfs dfs -setrep 4 /hive/…
hadoop dfs -D dfs.replication=2 -put abc.txt /tmp

HAR

Apache Hadoop Archives – Hadoop Archives Guide

Create

hadoop archive -archiveName foo.har -p <src-parent-dir> [-r <replication factor>] <src>* <dest>

Archive an entire directory
hadoop archive -archiveName zoo.har -p /foo/bar -r 3 /outputdir

Creating the archive submits a MapReduce job to build the har file.
A HAR file is really a directory whose name ends in ".har" and contains at least three files:

  1. _index // metadata for the directories and files inside the archive
  2. _masterindex // an index over the "_index" file
  3. part-00000 … // the original file contents concatenated, with no compression
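For example, listing the zoo.har created above shows those internal files directly (a sketch):

hdfs dfs -ls /outputdir/zoo.har   # shows _index, _masterindex and the part-* data files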

Read

hdfs dfs -ls har:///har/hp2.har/hp/

Extract

Sequential: hdfs dfs -cp har:///user/zoo/foo.har/dir1 hdfs:/user/zoo/newdir
Parallel: hadoop distcp har:///user/zoo/foo.har/dir1 hdfs:/user/zoo/newdir

distcp

Specify a queue

hadoop distcp -Dmapred.job.queue.name=root.default …
hadoop distcp -Dmapreduce.job.queuename …

hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar wordcount -Dmapreduce.job.queuename=ydy_bi_yarn27 input output
hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar pi -Dmapreduce.job.queuename=ydy_bi_yarn27 2 3

Copying to SFTP

sftpRemote="/xxx"
sftp_user="xxx"
sftp_pw='xxx' ## server password
sftp_ip=""
sftp_port=""

hadoop distcp -D fs.sftp.impl=org.apache.hadoop.fs.sftp.SFTPFileSystem hdfs:///hive/warehouse/ads.db/user_label_iop/tp=202309 sftp://${sftp_user}:${sftp_pw}@${sftp_ip}:${sftp_port}${sftpRemote}/tpftp=202309

Accessing two HA clusters at the same time

core-site.xml
Configures the default nameservice used by this client

<property>
<name>fs.defaultFS</name>
<value>hdfs://nameservice1</value>
</property>

hdfs-site.xml

<configuration>
<!-- services -->
<!-- local service and remote service -->
<property>
<name>dfs.nameservices</name>
<value>ns1,ns8</value>
</property>
<!-- clusters that Datanode will report to -->
<property>
<name>dfs.internal.nameservices</name>
<value>ns1</value>
</property>

<!-- remote namespace ns8 -->
<!-- service ns8 -->
<property>
<name>dfs.ha.namenodes.ns8</name>
<value>nn1,nn2</value>
</property>
<property>
<name>dfs.namenode.rpc-address.ns8.nn1</name>
<value>192.168.100.1:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.ns8.nn2</name>
<value>192.168.100.2:8020</value>
</property>
<property>
<name>dfs.namenode.http-address.ns8.nn1</name>
<value>192.168.100.1:50070</value>
</property>
<property>
<name>dfs.namenode.http-address.ns8.nn2</name>
<value>192.168.100.2:50070</value>
</property>
<property>
<name>dfs.client.failover.proxy.provider.ns8</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>


<!-- local namespace ns1 -->
<!-- service ns1 -->
<property>
<name>dfs.ha.namenodes.ns1</name>
<value>nn1,nn2</value>
</property>
<property>
<name>dfs.namenode.rpc-address.ns1.nn1</name>
<value>dev01:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.ns1.nn2</name>
<value>dev02:8020</value>
</property>
<property>
<name>dfs.namenode.http-address.ns1.nn1</name>
<value>dev01:50070</value>
</property>
<property>
<name>dfs.namenode.http-address.ns1.nn2</name>
<value>dev02:50070</value>
</property>
<property>
<name>dfs.client.failover.proxy.provider.ns1</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
</configuration>
hadoop fs -ls /
hadoop fs -ls hdfs://ns1/ ## the client's default cluster
hadoop fs -ls hdfs://ns8/ ## the ns8 cluster

hadoop distcp -Dmapred.job.queue.name=root.userA -pb \
hdfs://ns1/user/userA/source_path/source_file \
hdfs://ns8/user/userA/dest_path

Use {} to read multiple enumerated path values

${Burpoint_HDFS_PATH}/source={${data_sources}}/platform=*/year=$year/month=$month/day=$day
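A sketch of how the brace list expands into several source paths (the variable values are hypothetical; HDFS globbing handles the {a,b,c} alternation):

data_sources="app1,app2"
hdfs dfs -ls "${Burpoint_HDFS_PATH}/source={${data_sources}}/platform=*/year=2024/month=01/day=10"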

Handling data loss

Safe mode is ON. The reported blocks 3 needs additional 2 blocks to reach the threshold 0.9990 of total blocks 5. The number of live datanodes 2 has reached the minimum number 0. Safe mode will be turned off automatically once the thresholds have been reached.

  • Make sure zkfc is running: $HADOOP_HOME/sbin/hadoop-daemon.sh start zkfc
  • Leave safe mode: hdfs dfsadmin -safemode leave
  • Pick the active NN: hdfs haadmin -failover nn1 nn2 (plain hdfs commands would fail against port 9000 because a standby NN is not readable)
  • Check for problem files: hdfs fsck /
  • Delete the corrupt files: hdfs fsck /tmp -delete

balance

hdfs balancer -threshold 10 -blockpools BP-55455179-10.27.48.1-1677135619428 -source 10.27.48.20

threshold controls the allowed deviation: each DataNode's utilization should end up no more than this many percentage points above or below the cluster-average utilization

[-exclude [-f <hosts-file> | <comma-separated list of hosts>]]
[-include [-f <hosts-file> | <comma-separated list of hosts>]]
[-source [-f <hosts-file> | <comma-separated list of hosts>]]
[-blockpools <comma-separated list of blockpool ids>]
[-idleiterations <idleiterations>]
hdfs dfsadmin -setBalancerBandwidth <bandwidth in bytes per second>

200M: hdfs dfsadmin -setBalancerBandwidth 209715200

The header of the progress output is:
Time Stamp | Iteration# | Bytes Already Moved | Bytes Left To Move | Bytes Being Moved | NameNode

Emptying the trash

hdfs dfs -expunge
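As a reminder, expunge removes trash checkpoints older than the retention window, which is set by fs.trash.interval in core-site.xml (a sketch; the value is in minutes, 0 disables the trash):

<property>
<name>fs.trash.interval</name>
<value>1440</value>
</property>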

Securing the web UI

  <property>
<name>hadoop.http.filter.initializers</name>
<value>org.apache.hadoop.security.AuthenticationFilterInitializer</value>
</property>
<property>
<name>hadoop.http.authentication.type</name>
<value>simple</value>
</property>
<property>
<name>hadoop.http.authentication.signature.secret.file</name>
<value>/opt/hadoop/secret/hadoop-http-auth-signature-secret</value>
</property>
<property>
<name>hadoop.http.authentication.simple.anonymous.allowed</name>
<value>false</value>
</property>

Then access with user.name set to the string written in the secret file: http://127.0.0.1:9870?user.name=qazwsx$123

Protecting it with a username/password via nginx

yum install httpd-tools
yum install httpd
Start httpd: service httpd restart
httpd installs under /etc/httpd/ by default
Its configuration lives in /etc/httpd/httpd.conf
Once it is up, port 80 of the server shows the Apache welcome page.

# set the username/password and generate the db file
htpasswd -c /usr/local/nginx/passwd.db username password
# inspect the generated db file
cat /usr/local/nginx/passwd.db

vi /usr/local/nginx/conf/nginx.conf
server {
listen 50070;
server_name localhost;

location / {
auth_basic "hadoop001"; # authentication realm name
auth_basic_user_file /usr/local/nginx/passwd.db; # user/password database for basic auth
proxy_pass http://127.0.0.1:9870; # proxy to the Hadoop web UI
}
}

Restricting data permissions

https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsPermissionsGuide.html

  • The user who starts the NameNode process is the superuser by default and is unrestricted (so do not let other users start the cluster)
  • Ordinary users access files under their own accounts; they get the hadoop/spark/flink commands, their HDFS directories and their personal code space
  • Change permissions on files that should be protected; from then on they are managed by the file owner or the superuser
dfs.permissions.enabled = true -> hdfs-site, default true

dfs.permissions.superusergroup = supergroup -> hdfs-site, default supergroup

fs.permissions.umask-mode = 022 -> core-site, default 022

# hadoop-policy.xml
yarn rmadmin -refreshServiceAcl
hadoop dfsadmin -refreshServiceAcl

Master switch
hadoop.security.authorization=true
Controls job submission / JobTracker
security.job.client.protocol.acl
Controls HDFS access / NameNode
security.client.protocol.acl


# yarn-site

Enable YARN ACLs:
hadoop: core-site.xml
hadoop.security.authorization=true # enables service-level authorization; without it the ACL settings of Hadoop components do not take effect
yarn: yarn-site.xml
<property>
<name>yarn.acl.enable</name>
<value>true</value>
</property>
<property>
<name>yarn.admin.acl</name>
<value>hdfs</value>
</property>

<property>
<name>yarn.nodemanager.default-container-executor.log-dirs.permissions</name>
<value>755</value>
</property>
$ vi $HADOOP_CONF_DIR/capacity-scheduler.xml

$ yarn rmadmin -refreshQueues
yarn.scheduler.capacity.root.<queue-path>.acl_submit_applications

How users operate

Startup: start the Hadoop services as the hdfs user.

Job deployment: with ds (DolphinScheduler), uploaded jars and submitted jobs run as the hadoop user; in production, data written by hadoop is created with mode 755.

Personal debugging / data inspection: run hadoop/spark/flink commands locally.

Data protection: periodically chown critical data to the hdfs user with mode 755, so only hdfs can delete those files.

hdfs dfs -chown hdfs xxx

hdfs dfs -chmod  -R 755 xxx

Configuring permissions

hdfs dfs -getfacl /

hdfs dfs -setfacl -m user:hue:rwx /warehouse/tablespace/managed/hive

GroupMapping configuration

Configure group mapping as described in the Hadoop docs; if no group is configured it falls back to the user level.

core-site.xml resolves groups from the configured static mapping first; the default value is dr.who=;

<property>
<name>hadoop.user.group.static.mapping.overrides</name>
<value>user1=group1,group2;user2=;user3=group2</value>
</property>

If no static entry matches, the mapping service provider configured as hadoop.security.group.mapping is used.
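A sketch of that provider setting; the value below is the shipped default, and org.apache.hadoop.security.ShellBasedUnixGroupsMapping or LdapGroupsMapping can be configured instead:

<property>
<name>hadoop.security.group.mapping</name>
<value>org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback</value>
</property>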

After changing the static mapping, refresh the configuration:
hdfs dfsadmin -refreshUserToGroupsMappings

– only configured on the NN, but the NN must be restarted

hadoop用户和权限 - 过雁 - 博客园 (cnblogs.com)

HDFS file-cleanup inspection

Keeps users from dumping files in arbitrary locations.

# modified 20230303
KNOWN_FILE="^/data$"
KNOWN_FILE=$KNOWN_FILE"|^/dolphinscheduler$"
KNOWN_FILE=$KNOWN_FILE"|^/flink$"
KNOWN_FILE=$KNOWN_FILE"|^/hive$"
KNOWN_FILE=$KNOWN_FILE"|^/spark$"
KNOWN_FILE=$KNOWN_FILE"|^/tmp$"
KNOWN_FILE=$KNOWN_FILE"|^/user$"
KNOWN_FILE=$KNOWN_FILE"|^/yarn$"
hdfs dfs -ls -C / |grep -Ev $KNOWN_FILE

KNOWN_FILE="^/data/huadan$"
KNOWN_FILE=$KNOWN_FILE"|^/data/env$"
hdfs dfs -ls -C /data |grep -Ev $KNOWN_FILE

KNOWN_FILE="^/dolphinscheduler/hadoop$"
hdfs dfs -ls -C /dolphinscheduler |grep -Ev $KNOWN_FILE

KNOWN_FILE="^/flink/completed-jobs$"
hdfs dfs -ls -C /flink |grep -Ev $KNOWN_FILE

KNOWN_FILE="^/spark/warehouse$"
KNOWN_FILE=$KNOWN_FILE"|^/spark/spark-history$"
KNOWN_FILE=$KNOWN_FILE"|^/spark/spark-jar.zip$"
hdfs dfs -ls -C /spark |grep -Ev $KNOWN_FILE

KNOWN_FILE="^/hive/warehouse$"
KNOWN_FILE=$KNOWN_FILE"|^/hive/tmp$"
hdfs dfs -ls -C /hive |grep -Ev $KNOWN_FILE


KNOWN_FILE="^/hive/warehouse/ads.db$"
KNOWN_FILE=$KNOWN_FILE"|^/hive/warehouse/default.db$"
KNOWN_FILE=$KNOWN_FILE"|^/hive/warehouse/dim.db$"
KNOWN_FILE=$KNOWN_FILE"|^/hive/warehouse/dm.db$"
KNOWN_FILE=$KNOWN_FILE"|^/hive/warehouse/dw.db$"
KNOWN_FILE=$KNOWN_FILE"|^/hive/warehouse/dwd.db$"
KNOWN_FILE=$KNOWN_FILE"|^/hive/warehouse/lg.db$"
KNOWN_FILE=$KNOWN_FILE"|^/hive/warehouse/mid.db$"
KNOWN_FILE=$KNOWN_FILE"|^/hive/warehouse/ods.db$"
KNOWN_FILE=$KNOWN_FILE"|^/hive/warehouse/oracle_orc_fromdatax.db$"
KNOWN_FILE=$KNOWN_FILE"|^/hive/warehouse/tmp.db$"
hdfs dfs -ls -C /hive/warehouse |grep -Ev $KNOWN_FILE


Finding Hive tables currently being written

grep '^2024-01-10'  /data02/hadoop/log/hadoop-hadoop-namenode-master.log|grep completeFile  |awk '{print $7}' |grep '^/hive/warehouse' |awk -F'/' '{print $1"/"$2"/"$3"/"$4"/"$5}'|sort -u

Detecting file usage via audit logs

Background: large numbers of data files are generated; to clean them up or move them to cold storage we need to know which are unused.

Commands only logged at DEBUG level (reading a table shows up as cmd=open):

dfs.namenode.audit.log.debug.cmdlist=getfileinfo,listStatus

Enable auditing (requires a NameNode restart, since the setting goes into the JVM startup command):

export HDFS_AUDIT_LOGGER=INFO,RFAAUDIT

$HADOOP_HOME/sbin/hadoop-daemon.sh stop namenode
$HADOOP_HOME/sbin/hadoop-daemon.sh start namenode

All client IPs

awk '{print $8}' hdfs-audit.log | sort -u

Are files pulled from the remote cluster still in use?

All create operations
cat hdfs-audit.log |grep cmd=create |awk '{print $8,$9,$10}' | awk -F'/' '{print $2"/"$3"/"$4"/"$5"/"$6}' |sort -u > check_create

All open operations
cat hdfs-audit.log |grep cmd=open |grep -v application | grep -v inprogress |awk '{print $8,$9,$10}' | awk -F'/' '{print $2"/"$3"/"$4"/"$5"/"$6}' |sort -u > check_open

Tables created by the remote cluster /hive/warehouse/xx.db/xxx
grep "10.27.48" check_create |awk -F'/' '{print "/"$2"/"$3"/"$4"/"$5}' |sort -u

Tables opened by the local cluster /hive/warehouse/xx.db/xxx
grep -v "10.27.48" check_open |grep -v Staging | awk -F'/' '{print "/"$2"/"$3"/"$4"/"$5}' |sort -u

Verify
grep KEYWORDS hdfs-audit.log |awk '{print $8,$9,$10}'

Are the created files still in use?

List the files currently open
hdfs dfsadmin -listOpenFiles

Group and count by table-name prefix

hdfs dfs -ls /hive/warehouse/tmp.db/ |awk '{print $8}' |awk -F'/' '{print $5}' |awk -F'_' '{print $1}' | sort |uniq -c |sort -h

Node management: decommissioning

Apache Hadoop 3.3.6 – HDFS DataNode Admin Guide

Use cases: taking a node out of service or putting it into maintenance. The configuration tells the NameNode what is planned for the node so it can control block re-replication; for example, maintenance state marks the node as only temporarily offline, so no re-replication is triggered and unnecessary I/O is avoided.

Option 1: hostname-only

This mode only supports decommission and recommission, not maintenance.

echo "10.17.41.133" > datanode.excludes
echo "10.17.41.133" > datanode.includes

vi hdfs-site.xml (dfs.hosts and dfs.hosts.exclude)

<property> 
<name>dfs.hosts.exclude</name>
<value>/data/hadoop-2.8.3/etc/hadoop/datanode.excludes</value>
</property>

hdfs dfsadmin -refreshNodes

Show the 16 lines following the match
hdfs dfsadmin -report |grep -A 16 10.17.41.26

Option 2: JSON

Changing the classname below in hdfs-site.xml requires a NameNode restart.

hdfs-site.xml
<property> 
<name>dfs.namenode.hosts.provider.classname</name>
<value>org.apache.hadoop.hdfs.server.blockmanagement.CombinedHostFileManager</value>
</property>
<property>
<name>dfs.hosts</name>
<value>/data/soft/hadoop/etc/hadoop/datanode.json</value>
</property>

Nodes not listed are in the NORMAL state, meaning any other node may register with the NameNode.
adminState: the default value is NORMAL; use DECOMMISSIONED for decommissioning and IN_MAINTENANCE for maintenance.

datanode.json
[
{
"hostName": "10.27.48.2",
"maintenanceExpireTimeInMS": "120000",
"adminState": "IN_MAINTENANCE"
}
]
maintenanceExpireTimeInMS defaults to forever if omitted.

$HADOOP_HOME/sbin/hadoop-daemon.sh stop namenode
$HADOOP_HOME/sbin/hadoop-daemon.sh start namenode
$HADOOP_HOME/sbin/hadoop-daemon.sh stop datanode
$HADOOP_HOME/sbin/hadoop-daemon.sh start datanode
$HADOOP_HOME/sbin/hadoop-daemon.sh stop nodemanager
$HADOOP_HOME/sbin/hadoop-daemon.sh start nodemanager

Active/standby switchover

hdfs haadmin -getAllServiceState
hdfs haadmin -failover nn1 nn2

Refreshing configuration without a restart

Reconfigurable properties include, for example, fs.protected.directories, which lists directories protected from deletion.

hdfs dfsadmin -reconfig <namenode|datanode> <host:ipc_port> <start|status|properties>

properties lists every property that supports live reconfiguration, start triggers a reconfiguration, and status shows the result.
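A sketch of the flow for fs.protected.directories (the NameNode host and RPC port are placeholders):

# 1. edit fs.protected.directories in hdfs-site.xml on the NameNode
# 2. ask the NameNode to reload its reconfigurable properties
hdfs dfsadmin -reconfig namenode nn-host:8020 start
hdfs dfsadmin -reconfig namenode nn-host:8020 status     # check whether the change was applied
hdfs dfsadmin -reconfig namenode nn-host:8020 properties # list the reconfigurable keys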

DEBUG

[Enable/disable Hadoop debug output]
Enable: export HADOOP_ROOT_LOGGER=DEBUG,console

Disable: export HADOOP_ROOT_LOGGER=INFO,console

Benchmark

TestDFSIO

# write test
hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-3.2.4.jar org.apache.hadoop.fs.TestDFSIO -D test.build.data=/user/ydy_bi_user48/benchmarks/TestDFSIO -write -nrFiles 10 -size 1000MB

hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.10.2.jar TestDFSIO -D test.build.data=/tmp/benchmarks/TestDFSIO -write -nrFiles 10 -size 1000MB


# read test
hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-3.2.4.jar org.apache.hadoop.fs.TestDFSIO -D test.build.data=/user/ydy_bi_user48/benchmarks/TestDFSIO -read -nrFiles 10 -size 100MB

hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.10.2.jar TestDFSIO -D test.build.data=/tmp/benchmarks/TestDFSIO -read -nrFiles 10 -size 100MB


# clean up test data
hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-3.2.4.jar org.apache.hadoop.fs.TestDFSIO -D test.build.data=/user/ydy_bi_user48/benchmarks/TestDFSIO -clean

hadoop jar ./share/hadoop/mapreduce/hadoop-mapreduce-client-jobclient-2.10.2.jar TestDFSIO -D test.build.data=/tmp/benchmarks/TestDFSIO -clean

