Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
|
FROM eclipse-temurin:8
LABEL maintainer="yijiong" github="github.com/y1jiong"

# Configure apt mirror and install ssh (server for inter-node daemon startup,
# client so the master can reach the workers). Combine update+install and
# clean the apt lists in the same layer to keep the image small.
COPY sources.list /etc/apt/sources.list
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        openssh-client \
        openssh-server && \
    rm -rf /var/lib/apt/lists/*

# Configure passwordless ssh between cluster nodes.
# NOTE(security): this bakes a private key into the image — anyone with the
# image can ssh into every container built from it. Acceptable only for a
# local demo cluster, never for anything reachable from outside.
# ssh requires the private key to be mode 600 and ~/.ssh to be 700,
# otherwise it refuses the key and the cluster scripts hang on prompts.
COPY ssh-key/ /root/.ssh/
RUN chmod 700 /root/.ssh && \
    chmod 600 /root/.ssh/* && \
    echo "service ssh start" >> ~/.bashrc

# Install hadoop: extract directly into /usr/local and remove the tarball in
# the same layer so the archive does not persist in an intermediate layer.
COPY hadoop-3.3.6.tar.gz /tmp/hadoop-3.3.6.tar.gz
RUN tar -xzf /tmp/hadoop-3.3.6.tar.gz -C /usr/local && \
    mv /usr/local/hadoop-3.3.6 /usr/local/hadoop && \
    rm -f /tmp/hadoop-3.3.6.tar.gz

# Hadoop runtime environment, grouped by concern and deduplicated.
# HADOOP_HOME and JAVA_HOME are set first so later ENV lines can expand them
# (variables set in one ENV instruction are visible to the next instruction).
ENV JAVA_HOME=/opt/java/openjdk \
    HADOOP_HOME=/usr/local/hadoop
ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin \
    HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop \
    HADOOP_MAPRED_HOME=$HADOOP_HOME \
    HADOOP_COMMON_HOME=$HADOOP_HOME \
    HADOOP_HDFS_HOME=$HADOOP_HOME \
    HADOOP_YARN_HOME=$HADOOP_HOME \
    YARN_HOME=$HADOOP_HOME \
    HADOOP_INSTALL=$HADOOP_HOME \
    HADOOP_LIBEXEC_DIR=$HADOOP_HOME/libexec \
    HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native \
    HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native" \
    JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native:$JAVA_LIBRARY_PATH
# Hadoop 3 refuses to start daemons as root unless these are set.
ENV HDFS_NAMENODE_USER=root \
    HDFS_DATANODE_USER=root \
    HDFS_DATANODE_SECURE_USER=root \
    HDFS_SECONDARYNAMENODE_USER=root \
    HDFS_SECONDARYNAMENODE_USER=root \
    YARN_RESOURCEMANAGER_USER=root \
    YARN_NODEMANAGER_USER=root

# The start/stop scripts run over ssh, which does not inherit this image's
# ENV, so the same values must also live in hadoop-env.sh.
RUN printf '%s\n' \
      "export JAVA_HOME=/opt/java/openjdk" \
      "export HDFS_NAMENODE_USER=root" \
      "export HDFS_DATANODE_USER=root" \
      "export HDFS_SECONDARYNAMENODE_USER=root" \
      "export YARN_RESOURCEMANAGER_USER=root" \
      "export YARN_NODEMANAGER_USER=root" \
      >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh

# Site-specific hadoop configuration (core/hdfs/mapred/yarn/workers).
COPY etc/ /usr/local/hadoop/etc/hadoop/
# Start up is done manually from inside the master container (see the
# cluster-start instructions below), so no CMD is baked in.
#CMD start-all.sh
|
需要准备的目录和文件
1
2
|
$ ls
Dockerfile etc/ hadoop-3.3.6.tar.gz sources.list ssh-key/
|
etc/
1
2
|
$ ls etc/
core-site.xml hdfs-site.xml mapred-site.xml workers yarn-site.xml
|
core-site.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Core Hadoop configuration for the 3-node demo cluster (h01/h02/h03). -->
<configuration>
    <!-- FIX: fs.default.name (deprecated alias of fs.defaultFS) previously
         pointed at port 9000 while fs.defaultFS pointed at 8020 — two
         conflicting NameNode addresses. Both now agree on 8020, the
         Hadoop 3 default NameNode RPC port. -->
    <property>
        <name>fs.default.name</name>
        <value>hdfs://h01:8020</value>
    </property>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://h01:8020</value>
    </property>
    <!-- Base for temporary/working directories; backed by the host bind
         mount so data survives container re-creation. -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/home/hadoop3/hadoop/tmp</value>
    </property>
    <!-- User shown as the owner in the HDFS web UI file browser. -->
    <property>
        <name>hadoop.http.staticuser.user</name>
        <value>root</value>
    </property>
    <!-- Allow root to impersonate any user from any host (needed by tools
         such as Hive/HttpFS that proxy requests through root). -->
    <property>
        <name>hadoop.proxyuser.root.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.root.groups</name>
        <value>*</value>
    </property>
</configuration>
|
hdfs-site.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- HDFS configuration for the 3-node demo cluster. -->
<configuration>
    <!-- 2 replicas: fewer than the 3 nodes, so one node can be lost
         without under-replication being unresolvable. -->
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
    <!-- NameNode metadata (fsimage/edits), on the host bind mount. -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/home/hadoop3/hadoop/hdfs/name</value>
    </property>
    <!-- FIX: was dfs.namenode.data.dir, which is not a valid HDFS property;
         DataNode block storage is configured via dfs.datanode.data.dir.
         With the wrong key, datanodes silently stored blocks in the default
         location under hadoop.tmp.dir instead of this directory. -->
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/home/hadoop3/hadoop/hdfs/data</value>
    </property>
    <!-- NameNode web UI on the master (Hadoop 3 default port 9870). -->
    <property>
        <name>dfs.namenode.http-address</name>
        <value>h01:9870</value>
    </property>
    <!-- Secondary NameNode runs on h03 to spread daemons across nodes. -->
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>h03:9868</value>
    </property>
</configuration>
|
mapred-site.xml
1
2
3
4
5
6
7
8
9
|
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- MapReduce configuration: run MR jobs on YARN. -->
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <!-- On Hadoop 3.x the MR ApplicationMaster and tasks do not inherit
         HADOOP_MAPRED_HOME from the NodeManager; without these, jobs fail
         with "Could not find or load main class ...MRAppMaster". The path
         matches HADOOP_HOME from the Dockerfile. -->
    <property>
        <name>yarn.app.mapreduce.am.env</name>
        <value>HADOOP_MAPRED_HOME=/usr/local/hadoop</value>
    </property>
    <property>
        <name>mapreduce.map.env</name>
        <value>HADOOP_MAPRED_HOME=/usr/local/hadoop</value>
    </property>
    <property>
        <name>mapreduce.reduce.env</name>
        <value>HADOOP_MAPRED_HOME=/usr/local/hadoop</value>
    </property>
</configuration>
|
workers
这里演示 3 个节点
yarn-site.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
<?xml version="1.0"?>
<!-- YARN configuration for the 3-node demo cluster. -->
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<!-- Required auxiliary service so reducers can fetch map output. -->
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<!-- ResourceManager runs on h02 (NameNode is on h01, SNN on h03). -->
<name>yarn.resourcemanager.hostname</name>
<value>h02</value>
</property>
<property>
<!-- Environment variables NodeManagers are allowed to pass through
     from their own environment into launched containers. -->
<name>yarn.nodemanager.env-whitelist</name>
<value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
</property>
</configuration>
|
ssh-key/
1
2
3
|
# Generate a 4096-bit RSA key pair with an empty passphrase (-P ''), then
# authorize that same public key — every container shares one identity, so
# any node can ssh into any other without a password.
cd ssh-key/
ssh-keygen -t rsa -b 4096 -P '' -f id_rsa
cp id_rsa.pub authorized_keys
|
sources.list
这里是为了换源
1
2
|
# Start from the image's own sources.list and point it at the Tsinghua
# mirror so apt downloads are fast from inside mainland China.
cp /etc/apt/sources.list .
sed -i 's/archive.ubuntu.com/mirrors.tuna.tsinghua.edu.cn/g' sources.list
|
hadoop-3.3.6.tar.gz
从 官网 下载
版本号可能不一样,可以选择最新的版本,也可以选择这里示例的版本
构建镜像
1
|
docker build -t y1jiong/hadoop:3.3.6 .
|
启动容器
建议新建一个网络
1
|
docker network create hadoop
|
启动集群
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
|
# Start the three nodes on the dedicated network; each gets its own host
# directory bind-mounted at the path used by hadoop.tmp.dir and the
# name/data dirs, so HDFS state survives container re-creation.
docker run -itd \
--name h01 \
--hostname h01 \
--network hadoop \
-v /opt/hadoop/fs/h01:/home/hadoop3/hadoop \
y1jiong/hadoop:3.3.6 \
bash
docker run -itd \
--name h02 \
--hostname h02 \
--network hadoop \
-v /opt/hadoop/fs/h02:/home/hadoop3/hadoop \
y1jiong/hadoop:3.3.6 \
bash
docker run -itd \
--name h03 \
--hostname h03 \
--network hadoop \
-v /opt/hadoop/fs/h03:/home/hadoop3/hadoop \
y1jiong/hadoop:3.3.6 \
bash
docker attach h01
# First start only: the NameNode refuses to start on an unformatted
# metadata directory, so format it before launching the daemons.
# Do NOT re-run this on later starts — it would wipe HDFS metadata.
hdfs namenode -format
start-all.sh
# ctrl + p + q detaches from the container without stopping it
[^pq]
[^pq]
|
连接集群
可用 n2n,这里不再赘述
参考:n2n
别忘了改 hosts
1
2
3
|
172.x.0.x h01
172.x.0.x h02
172.x.0.x h03
|
其他工具
Hive
参考:Hadoop Hive Docker
Docker
https://hub.docker.com/r/apache/hive
Spark
Docker
https://hub.docker.com/r/bitnami/spark
https://hub.docker.com/_/spark