基于Mac M1[ARM64]环境下Docker部署大数据集群


一 机器依赖(CentOS7)[重要]

yum -y install \
vim \
sudo \
net-tools.aarch64 \
nmap-ncat.aarch64 \
telnet \
openssh-server \
openssh-clients


ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key
ssh-keygen -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key
ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key

二 基础环境规划

  1. 软件版本

    软件 版本
    Hadoop hadoop-2.10.1.tar.gz
    Hive apache-hive-2.3.1-bin.tar.gz
    Kafka kafka_2.12-2.0.0.tgz
    Zookeeper apache-zookeeper-3.5.7-bin.tar.gz
    Hbase hbase-2.4.9-bin.tar.gz
    Java jdk-8u202-linux-arm64-vfp-hflt.tar.gz
    Scala scala-2.12.15.tgz
    Spark spark-3.3.2-bin-without-hadoop.tgz
    Hudi hudi-release-0.13.0.zip
    Doris apache-doris-1.2.6-bin-arm64.tar.xz
    Flink flink-1.16.0-bin-scala_2.12.tgz
    Clickhouse clickhouse-
  2. 组件分布

    节点 组件列表
    hadoop-node1 namenode | datanode | resourcemanager | nodemanager | hive | spark | flink | fe be | clickhouse
    hadoop-node2 datanode | nodemanager | mysql | zookeeper | hive | hive.metastore | hbase | fe be | clickhouse
    hadoop-node3 datanode | nodemanager | kafka | hive | fe be | clickhouse
  3. 端口注册

  4. 各节点启动脚本整合

    • hadoop-node1 启动脚本

      ## env 
      source ~/.bash_profile
      ## hdfs 
      sbin/hadoop-daemon.sh start namenode
      sbin/hadoop-daemon.sh start datanode
      ## yarn
      sbin/yarn-daemon.sh start resourcemanager
      sbin/yarn-daemon.sh start nodemanager
      ## doris
      fe/bin/start_fe.sh --daemon
      be/bin/start_be.sh --daemon
    • hadoop-node2 启动脚本

      ## env 
      source ~/.bash_profile
      ## hdfs 
      sbin/hadoop-daemon.sh start datanode
      ## yarn
      sbin/yarn-daemon.sh start nodemanager
      ## mysql 
      systemctl start mariadb
      ## doris
      fe/bin/start_fe.sh --daemon
      be/bin/start_be.sh --daemon
      ## hbase 
      ## zookeeper 
      bin/zkServer.sh start
      ## hive-metastore 
      nohup hive --service metastore >> /opt/data/hive/hive-metastore.log &
    • hadoop-node3 启动脚本

      ## env 
      source ~/.bash_profile
      ## hdfs 
      sbin/hadoop-daemon.sh start datanode
      ## yarn
      sbin/yarn-daemon.sh start nodemanager
      ## doris
      fe/bin/start_fe.sh --daemon
      be/bin/start_be.sh --daemon
      ## kafka 
      kafka-server-start.sh  -daemon config/server.properties &
  5. 启动3个节点docker容器[重要]


    docker run -itd \
    -h hadoop-node1 \
    --name=hadoop-node1 \
    --privileged=true \
    --network=gawyn-bridge \
    -v /Users/chavinking/gawyn/hadoop-node1:/opt \
    centos:centos7 \


    docker run -itd \
    -h hadoop-node2 \
    --name=hadoop-node2 \
    --privileged=true \
    --network=gawyn-bridge \
    -v /Users/chavinking/gawyn/hadoop-node2:/opt \
    centos:centos7 \


    docker run -itd \
    -h hadoop-node3 \
    --name=hadoop-node3 \
    --privileged=true \
    --network=gawyn-bridge \
    -v /Users/chavinking/gawyn/hadoop-node3:/opt \
    centos:centos7 \

三 配置环境变量


## Java
export JAVA_HOME=/opt/system/jdk1.8.0_202
export PATH=$JAVA_HOME/bin:$JAVA_HOME/jre/bin:$PATH:.

## scala
export SCALA_HOME=/opt/system/scala-2.12.15
export PATH=$SCALA_HOME/bin:$PATH:.

## maven profile
export MAVEN_HOME=/opt/system/maven-3.5.0
export PATH=$MAVEN_HOME/bin:$PATH:.

## Hadoop Env
export HADOOP_HOME=/opt/system/hadoop-2.10.1

## ZooKeeper Env
export ZOOKEEPER_HOME=/opt/system/zookeeper-3.5.7

## kafka Env
export KAFKA_HOME=/opt/system/kafka-2.12-2.0.0
export PATH=$PATH:$KAFKA_HOME/bin:.

## Hive Env
export HIVE_HOME=/opt/system/hive-2.3.1
export PATH=$PATH:$HIVE_HOME/bin:.

## Hbase Env 
export HBASE_HOME=/opt/system/hbase-2.4.9
export PATH=$PATH:$HBASE_HOME/bin:.

## Spark Env 
export SPARK_HOME=/opt/system/spark-3.3.2
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin:.

## Doris Env 
export DORIS_HOME=/opt/system/doris-1.2.6
export PATH=$PATH:$DORIS_HOME/fe/bin:$DORIS_HOME/be/bin:.

## Init Env
sysctl -w vm.max_map_count=2000000
ulimit -n 65536

将source /opt/runner/docker-env.sh添加到~/.bash_profile 文件。

四 配置ssh授信

  1. 为CentOS7的root用户设置密码

    passwd root
  2. yum安装ssh服务端和客户端依赖,见步骤1

  3. 启动sshd服务



    [root@hadoop-node2 ~]# /usr/sbin/sshd

    Could not load host key: /etc/ssh/ssh_host_rsa_key

    Could not load host key: /etc/ssh/ssh_host_ecdsa_key

    Could not load host key: /etc/ssh/ssh_host_ed25519_key

    sshd: no hostkeys available -- exiting.


    ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key

    ssh-keygen -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key

    ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key

  4. 检查sshd是否启动

    % ps -ef|grep sshd
    root       145     1  0 06:55 ?        00:00:00 /usr/sbin/sshd
    % telnet 127.0.0.1 22
    Connected to 127.0.0.1.
    Escape character is '^]'.
  5. 节点间同步ssh授信文件,每个节点分别执行

    % ssh-keygen -t rsa
    % ssh-copy-id hadoop-node1
    % ssh hadoop-node1 date
    % ssh-copy-id hadoop-node2
    % ssh hadoop-node2 date
    % ssh-copy-id hadoop-node3
    % ssh hadoop-node3 date

五 部署Hadoop分布式集群

  1. 下载软件包并且解压软件

  2. 编辑hadoop配置文件,需要编辑配置文件如下:













  3. 编辑后的配置文件内容如下:

    • etc/hadoop/core-site.xml
    <?xml version="1.0" encoding="UTF-8"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
    <!-- Put site-specific property overrides in this file. -->
    • etc/hadoop/hdfs-site.xml
    <?xml version="1.0" encoding="UTF-8"?>
    <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
    <!-- Put site-specific property overrides in this file. -->
    • etc/hadoop/yarn-site.xml
    <?xml version="1.0"?>
    		<!-- Site specific YARN configuration properties -->
    • etc/hadoop/slaves
    • 配置Java环境变量
    • 创建数据目录
    • 同步hadoop-node1节点文件到其他节点
    % scp -r * hadoop-node2:/opt/
    % scp -r * hadoop-node3:/opt/
  4. 格式化HDFS

    % hdfs namenode -format
  5. 启动Hadoop服务

        sbin/hadoop-daemon.sh start|stop namenode
        sbin/hadoop-daemon.sh start|stop datanode
        sbin/yarn-daemon.sh start|stop resourcemanager
        sbin/yarn-daemon.sh start|stop nodemanager

六 部署MySQL服务

  1. 以yum方式简易安装

    % yum -y install mariadb mariadb-server
  2. 启动mysql服务

    % systemctl start mariadb



    systemctl status mariadb

    Failed to get D-Bus connection: No such file or directory


    vim ~/Library/Group\ Containers/group.com.docker/settings.json

    修改:"deprecatedCgroupv1": false 为 "deprecatedCgroupv1": true,然后重启Docker Desktop使配置生效


七 部署Zookeeper服务

  1. 下载软件并解压到安装目录,并且创建数据目录/opt/data/zkdata,配置环境变量

  2. 配置zookeeper配置文件

    # conf/zoo.cfg
  3. 启动zookeeper

    bin/zkServer.sh start

八 部署Kafka服务

  1. 下载软件并解压到安装目录,并且创建数据目录/opt/data/kafka-logs,配置环境变量

  2. 配置kafka配置文件

    # config/server.properties
  3. 启动kafka服务

    kafka-server-start.sh  -daemon config/server.properties &
  4. kafka常用命令

    kafka-topics.sh --create --zookeeper hadoop-node2:2181 --replication-factor 1 --partitions 1 --topic test
    kafka-topics.sh --list --zookeeper hadoop-node2:2181
    kafka-console-producer.sh --broker-list hadoop-node3:9092 --topic test
    kafka-console-consumer.sh --bootstrap-server hadoop-node3:9092 --from-beginning --topic test
    kafka-console-consumer.sh --zookeeper hadoop-node2:2181 --topic test --from-beginning

九 部署Hive服务

  1. 下载软件并解压到安装目录,配置环境变量

  2. 拷贝mysql jdbc依赖jar包到hive安装目录lib/文件夹内

  3. 配置Hive配置文件

    • hive-env.sh

    cp hive-env.sh.template hive-env.sh

    export HIVE_CONF_DIR=/opt/system/hive-2.3.1/conf
    • hive-site.xml

    cp hive-default.xml.template hive-site.xml

    <!-- hive metastore config -->
            <description>JDBC connect string for a JDBC metastore</description>
            <description>Driver class name for a JDBC metastore</description>
            <description>username to use against metastore database</description>
            <description>password to use against metastore database</description>
    <!-- hive warehouse dir -->
            <description>location of default database for the warehouse</description>
    <!-- java.io.tmpdir -->
            <description>Local scratch space for Hive jobs</description>
    <!-- hive metastore config -->
            <description>Hive metastore listener port</description>
  4. 创建hive仓库hdfs目录与hive元数据库并初始化元数据库

    % hdfs dfs -mkdir /user/root/warehouse
    % mkdir /opt/data/hive/tmp
    mysql> create database hive;
    mysql> grant all privileges on *.* to 'root'@'hadoop-node2' identified by 'mysql' with grant option;
    mysql> flush privileges;
    # bin/schematool -initSchema -dbType mysql
  5. 启动hive metastore服务

    % nohup hive --service metastore >> /opt/data/hive/hive-metastore.log &

十 部署Hbase服务

  1. 下载软件并解压到安装目录,配置环境变量

  2. 配置Hbase配置文件

    • hbase-env.sh
    export JAVA_HOME=/opt/system/jdk1.8.0_202
    export HADOOP_HOME=/opt/system/hadoop-2.10.1
    export HBASE_MANAGES_ZK=false
    • hbase-site.xml
    • regionservers
  3. 创建目录

    % hdfs dfs -mkdir /user/root/hbase
    % mkdir -p /opt/data/hbase/tmp
  4. 启动hbase服务

    % bin/start-hbase.sh
  5. 测试hbase集群

    % hbase shell
    hbase:001:0> status
    1 active master, 0 backup masters, 1 servers, 0 dead, 2.0000 average load
    Took 1.3384 seconds
    hbase:002:0> create 'testtable','colfaml'
    Created table testtable
    Took 0.7568 seconds
    => Hbase::Table - testtable
    hbase:003:0>  list 'testtable'
    1 row(s)
    Took 0.0390 seconds
    => ["testtable"]
    hbase:004:0> put 'testtable','myrow-1','colfaml:q1','value-1'
    Took 0.3415 seconds
    hbase:005:0> put 'testtable','myrow-2','colfaml:q2','value-2'
    Took 0.0067 seconds
    hbase:006:0> scan 'testtable'
    ROW                                                          COLUMN+CELL
     myrow-1                                                     column=colfaml:q1, timestamp=2023-08-08T06:14:14.685, value=value-1
     myrow-2                                                     column=colfaml:q2, timestamp=2023-08-08T06:14:19.278, value=value-2
    2 row(s)
    Took 0.0372 seconds
    hbase:007:0> get 'testtable','myrow-1'
    COLUMN                                                       CELL
     colfaml:q1                                                  timestamp=2023-08-08T06:14:14.685, value=value-1
    1 row(s)
    Took 0.0424 seconds

十一 部署Spark服务

  1. 下载软件并解压到安装目录,执行编译

    mvn clean package -DskipTests -Pyarn -Phadoop-2 -Dhadoop.version=2.10.1 -Phive -Phive-thriftserver 
  2. 配置环境变量,配置spark-env.sh文件[without-hadoop版本配置参数]

    export JAVA_HOME=/opt/system/jdk1.8.0_202
    export HADOOP_CONF_DIR=/opt/system/hadoop-2.10.1/etc/hadoop
    export YARN_CONF_DIR=/opt/system/hadoop-2.10.1/etc/hadoop
    export SPARK_DIST_CLASSPATH=$(hadoop classpath)
    export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
  3. 拷贝hive配置文件到spark配置目录

    cp /opt/system/hive-2.3.1/conf/hive-site.xml /opt/system/spark-3.3.2/conf/hive-site.xml
  4. 测试spark

    % spark-shell
    % spark-sql
  • 编译遇到问题

    • 问题1

      [ERROR] Failed to execute goal org.apache.maven.plugins:maven-enforcer-plugin:3.0.0-M2:enforce (enforce-versions) on project spark-parent_2.12: Some Enforcer rules have failed. Look above for specific messages explaining why the rule failed. -> [Help 1]


十二 部署Flink服务

  1. 下载软件并解压到安装目录,配置环境变量

  2. 配置flink配置文件

    taskmanager.numberOfTaskSlots: 4
  3. 测试flink

十三 部署Doris服务

  1. 下载软件并解压到安装目录,配置环境变量

  2. FE配置

    • 创建元数据目录

      % mkdir -p /opt/data/doris/doris-meta
      % mkdir -p /opt/data/doris/fe-log
    • 编辑配置文件[同步各个节点]

      LOG_DIR = /opt/data/doris/fe-log
      meta_dir = /opt/data/doris/doris-meta
      priority_networks = hadoop-node1-ip/24
      ## FE涉及端口号
      http_port = 18030
      edit_log_port = 19010
      rpc_port = 19020
      query_port = 19030
    • 启动FE

      % fe/bin/start_fe.sh --daemon
    • FE添加BE

      % mysql -hhadoop-node1 -uroot -p -P19030
      mysql > ALTER SYSTEM ADD BACKEND "hadoop-node1:19050";
      mysql > ALTER SYSTEM ADD BACKEND "hadoop-node2:19050";
      mysql > ALTER SYSTEM ADD BACKEND "hadoop-node3:19050";
      mysql > SHOW PROC '/frontends';
      mysql > SHOW PROC '/backends';
  3. BE配置

    • 创建数据目录

      % mkdir -p /opt/data/doris/doris-data
      % mkdir -p /opt/data/doris/be-log
    • 编辑配置文件[同步各个节点]

      priority_networks = hadoop-node1-ip/24
      storage_root_path = /opt/data/doris/doris-data
      ## BE涉及端口号
      be_port = 19060
      webserver_port = 18040
      heartbeat_service_port = 19050
      brpc_port = 18060
    • 启动BE

      • 添加环境变量

      sysctl -w vm.max_map_count=2000000

      ulimit -n 65536

      • 启动指令
      % be/bin/start_be.sh --daemon
  4. FE扩容3个节点

    • 注册节点

      mysql > ALTER SYSTEM ADD FOLLOWER "hadoop-node2:19010";
      mysql > ALTER SYSTEM ADD FOLLOWER "hadoop-node3:19010";
    • 启动节点

      % fe/bin/start_fe.sh --helper hadoop-node1:19010 --daemon

十四 部署Clickhouse服务

  1. 安装Clickhouse

    yum install -y yum-utils
    yum-config-manager --add-repo https://packages.clickhouse.com/rpm/clickhouse.repo
    yum clean all 
    yum makecache
    yum install -y clickhouse-server clickhouse-client
  2. 启动单机版Clickhouse

    sudo /etc/init.d/clickhouse-server start
    clickhouse-client # or "clickhouse-client --password" if you set up a password.
    sudo -u 'clickhouse' /usr/bin/clickhouse-server \
    --config-file /etc/clickhouse-server/config.xml \
    --pid-file /var/run/clickhouse-server/clickhouse-server.pid
  3. Clickhouse集群配置

    • 3.1 集群各个机器修改配置文件config.xml

      vim /etc/clickhouse-server/config.xml

    • 3.2 在每台机器的etc目录下新建metrika.xml文件,并且编辑如下内容:

          <!-- cluster config:单副本配置 -->
                  <!-- data shard one -->
                  <!-- data shard two -->
                  <!-- data shard three -->
          <!-- zookeeper config -->
              <node index="1">
          <!-- 本地节点副本名称 -->
              <!-- gawyn_cluster_node1/gawyn_cluster_node2/gawyn_cluster_node3 -->
          <!-- 监听网络 允许任何地址访问 --> 
          <!-- 数据压缩算法 --> 
    • 3.3 每台机器分别配置users.xml

          <!-- See also the files in users.d directory where the settings can be overridden. -->
          <!-- Profiles of settings. -->
              <!-- Default settings. -->
              <!-- Profile that allows only read queries. -->
          <!-- Users and ACL. -->
              <!-- If user name was not specified, 'default' user is used. -->
                  <!-- See also the files in users.d directory where the password can be overridden.
                       Password could be specified in plaintext or in SHA256 (in hex format).
                       If you want to specify password in plaintext (not recommended), place it in 'password' element.
                       Example: <password>qwerty</password>.
                       Password could be empty.
                       If you want to specify SHA256, place it in 'password_sha256_hex' element.
                       Example: <password_sha256_hex>65e84be33532fb784c48129675f9eff3a682b27168c0ea744b2cf58ee02337c5</password_sha256_hex>
                       Restrictions of SHA256: impossibility to connect to ClickHouse using MySQL JS client (as of July 2019).
                       If you want to specify double SHA1, place it in 'password_double_sha1_hex' element.
                       Example: <password_double_sha1_hex>e395796d6546b1b65db9d665cd43f0e858dd4303</password_double_sha1_hex>
                       If you want to specify a previously defined LDAP server (see 'ldap_servers' in the main config) for authentication,
                        place its name in 'server' element inside 'ldap' element.
                       Example: <ldap><server>my_ldap_server</server></ldap>
                       If you want to authenticate the user via Kerberos (assuming Kerberos is enabled, see 'kerberos' in the main config),
                        place 'kerberos' element instead of 'password' (and similar) elements.
                       The name part of the canonical principal name of the initiator must match the user name for authentication to succeed.
                       You can also place 'realm' element inside 'kerberos' element to further restrict authentication to only those requests
                        whose initiator's realm matches it.
                       Example: <kerberos />
                       Example: <kerberos><realm>EXAMPLE.COM</realm></kerberos>
                       How to generate decent password:
                       Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha256sum | tr -d '-'
                       In first line will be password and in second - corresponding SHA256.
                       How to generate double SHA1:
                       Execute: PASSWORD=$(base64 < /dev/urandom | head -c8); echo "$PASSWORD"; echo -n "$PASSWORD" | sha1sum | tr -d '-' | xxd -r -p | sha1sum | tr -d '-'
                       In first line will be password and in second - corresponding double SHA1.
                  <!-- List of networks with open access.
                       To open access from everywhere, specify:
                       To open access only from localhost, specify:
                       Each element of list has one of the following forms:
                       <ip> IP-address or network mask. Examples: or or
                           2a02:6b8::3 or 2a02:6b8::3/64 or 2a02:6b8::3/ffff:ffff:ffff:ffff::.
                       <host> Hostname. Example: server01.clickhouse.com.
                           To check access, DNS query is performed, and all received addresses compared to peer address.
                       <host_regexp> Regular expression for host names. Example, ^server\d\d-\d\d-\d\.clickhouse\.com$
                           To check access, DNS PTR query is performed for peer address and then regexp is applied.
                           Then, for result of PTR query, another DNS query is performed and all received addresses compared to peer address.
                           Strongly recommended that regexp is ends with $
                       All results of DNS requests are cached till server restart.
                  <!-- Settings profile for user. -->
                  <!-- Quota for user. -->
                  <!-- User can create other users and grant rights to them. -->
                  <!-- <access_management>1</access_management> -->
          <!-- Quotas. -->
              <!-- Name of quota. -->
                  <!-- Limits for time interval. You could specify many intervals with different limits. -->
                      <!-- Length of interval. -->
                      <!-- No limits. Just calculate resource usage for time interval. -->
    • 3.4 启动clickhouse服务


      mkdir -p /opt/data/clickhouse/tmp


      sudo /etc/init.d/clickhouse-server start
    • 3.5 客户端登陆


十五 编译Hudi

  1. 从github下载hudi源码

  2. 环境准备

    • Java&maven环境

      ~ % echo $JAVA_HOME

      % echo $MAVEN_HOME


    • 添加下面Kafka依赖到maven仓库


  3. 执行Hudi编译&验证是否编译成功

    mvn clean package -DskipTests \
    -Dspark3.3 -Dscala-2.12	\
    -Dflink1.16 -Dscala-2.12 \
    -Dhadoop.version=2.10.1 \


    # hudi-cli/hudi-cli.sh


