Hadoop Docker

Dockerfile

FROM eclipse-temurin:8
LABEL maintainer="yijiong" github="github.com/yzy613"

# configure apt and install packages
COPY sources.list /etc/apt/sources.list
RUN apt-get update && \
	apt-get install -y \
		openssh-server \
		openssh-client && \
	rm -rf /var/lib/apt/lists/*

# configure ssh
COPY ssh-key/* /root/.ssh/
RUN echo "service ssh start" >> ~/.bashrc

# install hadoop
COPY hadoop-3.3.6.tar.gz /tmp/hadoop-3.3.6.tar.gz
RUN cd /tmp && \
	tar -zxf hadoop-3.3.6.tar.gz && \
	mv hadoop-3.3.6 /usr/local/hadoop && \
	rm -f /tmp/hadoop-3.3.6.tar.gz

ENV HADOOP_HOME=/usr/local/hadoop
ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
ENV HADOOP_MAPRED_HOME=$HADOOP_HOME
ENV HADOOP_COMMON_HOME=$HADOOP_HOME
ENV HADOOP_HDFS_HOME=$HADOOP_HOME
ENV YARN_HOME=$HADOOP_HOME
ENV HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
ENV HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"

ENV JAVA_HOME=/opt/java/openjdk
ENV HADOOP_YARN_HOME=$HADOOP_HOME
ENV HADOOP_INSTALL=$HADOOP_HOME
ENV HADOOP_LIBEXEC_DIR=$HADOOP_HOME/libexec
ENV JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native:$JAVA_LIBRARY_PATH
ENV HDFS_DATANODE_USER=root
ENV HDFS_SECONDARYNAMENODE_USER=root
ENV HDFS_NAMENODE_USER=root
ENV YARN_RESOURCEMANAGER_USER=root
ENV YARN_NODEMANAGER_USER=root

RUN echo "export JAVA_HOME=/opt/java/openjdk" >> /usr/local/hadoop/etc/hadoop/hadoop-env.sh && \
	echo "export HDFS_NAMENODE_USER=root" >> /usr/local/hadoop/etc/hadoop/hadoop-env.sh && \
	echo "export HDFS_DATANODE_USER=root" >> /usr/local/hadoop/etc/hadoop/hadoop-env.sh && \
	echo "export HDFS_SECONDARYNAMENODE_USER=root" >> /usr/local/hadoop/etc/hadoop/hadoop-env.sh && \
	echo "export YARN_RESOURCEMANAGER_USER=root" >> /usr/local/hadoop/etc/hadoop/hadoop-env.sh && \
	echo "export YARN_NODEMANAGER_USER=root" >> /usr/local/hadoop/etc/hadoop/hadoop-env.sh

# hadoop config
COPY etc/* /usr/local/hadoop/etc/hadoop/

# start up
#CMD start-all.sh
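
Note: the first ssh connection between nodes normally stops to confirm the host key, which can stall start-all.sh. One workaround (an optional addition, not part of the original setup) is to drop a config file into ssh-key/ so that the existing COPY ssh-key/* /root/.ssh/ picks it up:

# ssh-key/config — hypothetical file; disables host-key prompts inside the cluster
Host *
    StrictHostKeyChecking no
    UserKnownHostsFile /dev/null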

Directories and files to prepare

$ ls
Dockerfile  etc/  hadoop-3.3.6.tar.gz  sources.list  ssh-key/

etc/

$ ls etc/
core-site.xml  hdfs-site.xml  mapred-site.xml  workers  yarn-site.xml

core-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://h01:8020</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/home/hadoop3/hadoop/tmp</value>
    </property>
    <property>
        <name>hadoop.http.staticuser.user</name>
        <value>root</value>
    </property>
    <property>
        <name>hadoop.proxyuser.root.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.root.groups</name>
        <value>*</value>
    </property>
</configuration>
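
Once a container is running, you can confirm the effective filesystem address with hdfs getconf, which ships with Hadoop:

hdfs getconf -confKey fs.defaultFS
# expected: hdfs://h01:8020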

hdfs-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/home/hadoop3/hadoop/hdfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/home/hadoop3/hadoop/hdfs/data</value>
    </property>
    <property>
      <name>dfs.namenode.http-address</name>
      <value>h01:9870</value>
    </property>
    <property>
      <name>dfs.namenode.secondary.http-address</name>
      <value>h03:9868</value>
    </property>
</configuration>

mapred-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>
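
With only mapreduce.framework.name set, MapReduce jobs on Hadoop 3 sometimes fail with "Could not find or load main class org.apache.hadoop.mapreduce.v2.app.MRAppMaster". If that happens, a common fix (an addition to this file, not part of the original config) is to point the application master and tasks at the MapReduce install:

<property>
    <name>yarn.app.mapreduce.am.env</name>
    <value>HADOOP_MAPRED_HOME=/usr/local/hadoop</value>
</property>
<property>
    <name>mapreduce.map.env</name>
    <value>HADOOP_MAPRED_HOME=/usr/local/hadoop</value>
</property>
<property>
    <name>mapreduce.reduce.env</name>
    <value>HADOOP_MAPRED_HOME=/usr/local/hadoop</value>
</property>

A quick way to test MapReduce once the cluster is up:

hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.6.jar pi 2 10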

workers

Three nodes are used in this demo:

h01
h02
h03

yarn-site.xml

<?xml version="1.0"?>
<configuration>
<!-- Site specific YARN configuration properties -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>h02</value>
    </property>
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
    </property>
</configuration>
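
Note that yarn.resourcemanager.hostname is h02, while the start commands later in this post run on h01. In Hadoop 3, start-yarn.sh (and therefore start-all.sh) starts the ResourceManager on the local machine, so it may fail to bind on h01. If that happens, start it on h02 instead:

docker attach h02
yarn --daemon start resourcemanager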

ssh-key/

cd ssh-key/
ssh-keygen -t rsa -b 4096 -P '' -f id_rsa
cp id_rsa.pub authorized_keys

Since every container is built from the same image, all three nodes share this one key pair, and each node can ssh into the others without a password.

sources.list

This replaces the default apt source with a mirror (Tsinghua TUNA here):

cp /etc/apt/sources.list .
sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' sources.list

hadoop-3.3.6.tar.gz

Download it from the official site.

The version number may differ; you can pick the latest release or the version used in this example.

Build the image

docker build -t yzy613/hadoop:3.3.6 .
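
A quick smoke test that the image was assembled correctly:

docker run --rm yzy613/hadoop:3.3.6 hadoop version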

Start the containers

It is recommended to create a dedicated network first:

docker network create hadoop

Start the cluster

docker run -itd \
    --name h01 \
    --hostname h01 \
    --network hadoop \
    -v /opt/hadoop/fs/h01:/home/hadoop3/hadoop \
    yzy613/hadoop:3.3.6 \
    bash
docker run -itd \
    --name h02 \
    --hostname h02 \
    --network hadoop \
    -v /opt/hadoop/fs/h02:/home/hadoop3/hadoop \
    yzy613/hadoop:3.3.6 \
    bash
docker run -itd \
    --name h03 \
    --hostname h03 \
    --network hadoop \
    -v /opt/hadoop/fs/h03:/home/hadoop3/hadoop \
    yzy613/hadoop:3.3.6 \
    bash
docker attach h01
hdfs namenode -format  # first start only: format the NameNode before start-all.sh
start-all.sh
# Ctrl+P then Ctrl+Q detaches from the container without stopping it
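
Once start-all.sh finishes, a few standard commands (run on h01) verify that all three nodes joined:

jps                    # should list NameNode, DataNode, NodeManager on h01
hdfs dfsadmin -report  # all three DataNodes should be registered
yarn node -list        # NodeManagers known to the ResourceManager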

Connecting to the cluster

You can use n2n to reach the cluster from outside the Docker host; it is not covered in detail here.

Reference: n2n

Don't forget to update your hosts file:

172.x.0.x h01
172.x.0.x h02
172.x.0.x h03
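
The container IPs on the hadoop network (to substitute for the 172.x.0.x placeholders above) can be read with docker inspect:

docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' h01 h02 h03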

Other tools

Hive

Reference: Hadoop Hive Docker

Docker

https://hub.docker.com/r/apache/hive

Spark

Docker

https://hub.docker.com/r/bitnami/spark

https://hub.docker.com/_/spark