1. Environment Preparation

1.1 System Requirements

  • Operating system: Linux (CentOS 7+ or Ubuntu 18.04+ recommended)
  • Java: JDK 8 or JDK 11
  • Memory: 4 GB minimum, 8 GB or more recommended
  • Storage: at least 50 GB of free space
  • Network: full connectivity between all cluster nodes (a quick check is sketched just below)
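
A quick way to confirm these prerequisites on every node is to run a few standard Linux commands (a minimal sketch; compare the output against the list above):

# Prerequisite check (run on every node)
head -2 /etc/os-release   # OS distribution and version
free -h                   # total memory
df -h /opt                # free disk space where Hadoop will live
nproc                     # CPU core count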

1.2 Host Planning

Node Type   | Hostname   | IP Address    | Roles
------------|------------|---------------|------------------
NameNode    | master     | 192.168.1.10  | NameNode, ResourceManager
DataNode    | slave1     | 192.168.1.11  | DataNode, NodeManager
DataNode    | slave2     | 192.168.1.12  | DataNode, NodeManager
DataNode    | slave3     | 192.168.1.13  | DataNode, NodeManager

1.3 Base Environment Configuration

# 1. Set the hostname
sudo hostnamectl set-hostname master  # run on the master node
sudo hostnamectl set-hostname slave1  # run on the slave1 node (repeat for slave2 and slave3)

# 2. Configure the hosts file (on every node)
sudo vim /etc/hosts
# Append the following entries
192.168.1.10 master
192.168.1.11 slave1
192.168.1.12 slave2
192.168.1.13 slave3

# 3. Disable the firewall (firewalld on CentOS; on Ubuntu use "sudo ufw disable" instead)
sudo systemctl stop firewalld
sudo systemctl disable firewalld

# 4. Disable SELinux (CentOS/RHEL only)
sudo setenforce 0
sudo sed -i 's/SELINUX=enforcing/SELINUX=disabled/g' /etc/selinux/config
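
With the hosts file in place, a short loop verifies that every hostname resolves and responds before moving on (a minimal sketch; run it on each node):

# Check name resolution and reachability of all cluster nodes
for h in master slave1 slave2 slave3; do
    ping -c 1 "$h" > /dev/null && echo "$h reachable" || echo "$h UNREACHABLE"
done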

2. Installing Java

2.1 Install OpenJDK

# CentOS/RHEL
sudo yum install -y java-1.8.0-openjdk java-1.8.0-openjdk-devel

# Ubuntu/Debian
sudo apt update
sudo apt install -y openjdk-8-jdk

2.2 Configure Java Environment Variables

# Locate the Java installation path
java -version
readlink -f $(which java)

# Configure environment variables
sudo vim /etc/profile
# Append the following (on Ubuntu the JDK path is typically /usr/lib/jvm/java-8-openjdk-amd64)
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
export PATH=$PATH:$JAVA_HOME/bin
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar

# Apply the changes
source /etc/profile

# Verify the installation
java -version
javac -version
echo $JAVA_HOME

3. Passwordless SSH Configuration

3.1 Generate SSH Keys

# Run on the master node
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa

# Copy the public key to every node (including master itself)
ssh-copy-id master
ssh-copy-id slave1
ssh-copy-id slave2
ssh-copy-id slave3

3.2 Verify Passwordless Login

# Test the SSH connections; each should log in without a password prompt (type exit to return)
ssh master
ssh slave1
ssh slave2
ssh slave3
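
The same check can be done non-interactively in one pass; BatchMode makes ssh fail instead of falling back to a password prompt (a minimal sketch run from the master node):

# Verify passwordless login to every node in one loop
for h in master slave1 slave2 slave3; do
    ssh -o BatchMode=yes "$h" hostname || echo "passwordless login to $h FAILED"
done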

4. Installing Hadoop

4.1 Download Hadoop

# Download Hadoop 3.3.4
cd /opt
sudo wget https://archive.apache.org/dist/hadoop/common/hadoop-3.3.4/hadoop-3.3.4.tar.gz

# Extract and normalize the directory name
sudo tar -xzf hadoop-3.3.4.tar.gz
sudo mv hadoop-3.3.4 hadoop
sudo chown -R $(whoami):$(whoami) /opt/hadoop
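
It is worth verifying the downloaded tarball before using it. Apache publishes a SHA-512 checksum next to each release; the file name below assumes the standard archive layout, so treat it as illustrative and compare the two digests by eye:

# Verify the tarball against the published checksum (assumed archive path)
sudo wget https://archive.apache.org/dist/hadoop/common/hadoop-3.3.4/hadoop-3.3.4.tar.gz.sha512
sha512sum hadoop-3.3.4.tar.gz
cat hadoop-3.3.4.tar.gz.sha512   # the digests should match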

4.2 Configure Hadoop Environment Variables

# Edit the environment variable file
sudo vim /etc/profile
# Append the following
export HADOOP_HOME=/opt/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export YARN_HOME=$HADOOP_HOME

# Apply the changes
source /etc/profile

# Verify the installation
hadoop version

5. Hadoop Configuration Files

5.1 hadoop-env.sh Configuration

# Edit hadoop-env.sh
vim $HADOOP_HOME/etc/hadoop/hadoop-env.sh

# Set JAVA_HOME
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk

# Set the OS user that runs each daemon (must match the account used to start the cluster)
export HDFS_NAMENODE_USER="hadoop"
export HDFS_DATANODE_USER="hadoop"
export HDFS_SECONDARYNAMENODE_USER="hadoop"
export YARN_RESOURCEMANAGER_USER="hadoop"
export YARN_NODEMANAGER_USER="hadoop"

5.2 core-site.xml Configuration

<!-- $HADOOP_HOME/etc/hadoop/core-site.xml -->
<configuration>
    <!-- NameNode (default filesystem) address -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>
    
    <!-- Hadoop temporary directory -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/opt/hadoop/tmp</value>
    </property>
    
    <!-- I/O buffer size (bytes) -->
    <property>
        <name>io.file.buffer.size</name>
        <value>131072</value>
    </property>
    
    <!-- Proxy user settings -->
    <property>
        <name>hadoop.proxyuser.hadoop.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.hadoop.groups</name>
        <value>*</value>
    </property>
</configuration>
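
After editing core-site.xml you can ask the Hadoop client which values it actually resolves; this catches cases where HADOOP_CONF_DIR points at a stale copy of the configuration:

# Confirm the configuration the client actually sees
hdfs getconf -confKey fs.defaultFS     # expected: hdfs://master:9000
hdfs getconf -confKey hadoop.tmp.dir   # expected: /opt/hadoop/tmp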

5.3 hdfs-site.xml Configuration

<!-- $HADOOP_HOME/etc/hadoop/hdfs-site.xml -->
<configuration>
    <!-- Replication factor -->
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    
    <!-- NameNode metadata directory -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/opt/hadoop/data/namenode</value>
    </property>
    
    <!-- DataNode data directory -->
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/opt/hadoop/data/datanode</value>
    </property>
    
    <!-- Block size (128 MB) -->
    <property>
        <name>dfs.blocksize</name>
        <value>134217728</value>
    </property>
    
    <!-- NameNode HTTP address -->
    <property>
        <name>dfs.namenode.http-address</name>
        <value>master:9870</value>
    </property>
    
    <!-- SecondaryNameNode HTTP address -->
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>master:9868</value>
    </property>
    
    <!-- Permission checking (disabled here for a test cluster; keep it enabled in production) -->
    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>
</configuration>
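
For intuition about these two settings: with a 128 MB block size and a replication factor of 3, a 1 GB file is split into 8 blocks and stored as 24 block replicas, occupying roughly 3 GB of raw DataNode capacity. Once the cluster is running and a test file has been uploaded (sections 7 and 8), fsck shows this layout directly (illustrative path from section 8.1):

# Show the blocks, replicas and DataNode locations of a file
hdfs fsck /test/passwd -files -blocks -locations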

5.4 mapred-site.xml Configuration

<!-- $HADOOP_HOME/etc/hadoop/mapred-site.xml -->
<configuration>
    <!-- Run MapReduce on YARN -->
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    
    <!-- JobHistory Server address -->
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>master:10020</value>
    </property>
    
    <!-- JobHistory Server web UI address -->
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>master:19888</value>
    </property>
    
    <!-- Map task container memory (MB) -->
    <property>
        <name>mapreduce.map.memory.mb</name>
        <value>1024</value>
    </property>
    
    <!-- Reduce task container memory (MB) -->
    <property>
        <name>mapreduce.reduce.memory.mb</name>
        <value>2048</value>
    </property>
    
    <!-- Map task JVM heap (~80% of the container memory) -->
    <property>
        <name>mapreduce.map.java.opts</name>
        <value>-Xmx819m</value>
    </property>
    
    <!-- Reduce task JVM heap (~80% of the container memory) -->
    <property>
        <name>mapreduce.reduce.java.opts</name>
        <value>-Xmx1638m</value>
    </property>
</configuration>
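
The heap settings above are deliberately about 80% of the container sizes (819 ≈ 0.8 × 1024, 1638 ≈ 0.8 × 2048), leaving headroom for non-heap JVM memory so YARN does not kill the container. They are only cluster-wide defaults; a single job can override them on the command line, as in this hedged sketch (paths reused from section 9; the examples jar accepts generic -D options):

# Illustrative per-job override of the map container size and heap
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.4.jar \
    wordcount \
    -D mapreduce.map.memory.mb=2048 \
    -D mapreduce.map.java.opts=-Xmx1638m \
    /input /output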

5.5 yarn-site.xml Configuration

<!-- $HADOOP_HOME/etc/hadoop/yarn-site.xml -->
<configuration>
    <!-- ResourceManager hostname -->
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
    </property>
    
    <!-- NodeManager auxiliary service for the MapReduce shuffle -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    
    <!-- Memory available to each NodeManager (MB) -->
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>4096</value>
    </property>
    
    <!-- CPU vcores available to each NodeManager -->
    <property>
        <name>yarn.nodemanager.resource.cpu-vcores</name>
        <value>4</value>
    </property>
    
    <!-- Minimum container allocation (MB) -->
    <property>
        <name>yarn.scheduler.minimum-allocation-mb</name>
        <value>512</value>
    </property>
    
    <!-- Maximum container allocation (MB) -->
    <property>
        <name>yarn.scheduler.maximum-allocation-mb</name>
        <value>4096</value>
    </property>
    
    <!-- ResourceManager web UI address -->
    <property>
        <name>yarn.resourcemanager.webapp.address</name>
        <value>master:8088</value>
    </property>
    
    <!-- Enable log aggregation -->
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
    
    <!-- Log retention period (seconds; 7 days) -->
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>604800</value>
    </property>
</configuration>
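
With these numbers each NodeManager offers 4096 MB, requests are typically rounded up to multiples of the 512 MB minimum, and a single container can be at most 4096 MB, i.e. an entire node; at the minimum size one node runs at most 8 containers concurrently (4096 / 512). Once the cluster is up, the aggregate figures can be read from the ResourceManager REST API (standard endpoint, shown here only as an illustration):

# Aggregate memory/vcore metrics from the ResourceManager (run after section 7)
curl -s http://master:8088/ws/v1/cluster/metrics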

5.6 workers File Configuration

# Edit the workers file
vim $HADOOP_HOME/etc/hadoop/workers

# List the DataNode hosts, one per line (remove the default localhost entry)
slave1
slave2
slave3

6. Distributing Hadoop and Its Configuration

6.1 Create the Distribution Script

#!/bin/bash
# distribute.sh - copy the Hadoop installation and profile to every worker node
# Note: writing to /opt and /etc/profile on the targets requires the appropriate
# permissions (run as root, or pre-create /opt/hadoop with the right owner on each node).

HADOOP_HOME=/opt/hadoop
NODES="slave1 slave2 slave3"

for node in $NODES; do
    echo "Distributing to $node..."
    scp -r $HADOOP_HOME $node:/opt/
    scp /etc/profile $node:/etc/
    # /etc/profile is read automatically at the next login on each node,
    # so sourcing it over SSH here would have no lasting effect.
done

echo "Distribution complete!"

6.2 Run the Distribution

chmod +x distribute.sh
./distribute.sh
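
When only the configuration changes later on, re-copying the whole installation is wasteful. A lighter alternative, assuming rsync is installed on every node, is to sync just the config directory:

# Sync only the configuration directory to the workers (illustrative)
for node in slave1 slave2 slave3; do
    rsync -az --delete $HADOOP_HOME/etc/hadoop/ $node:$HADOOP_HOME/etc/hadoop/
done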

7. Starting the Hadoop Cluster

7.1 Format the NameNode

# Create the data directories (the datanode and tmp directories are needed on every node)
mkdir -p /opt/hadoop/data/namenode
mkdir -p /opt/hadoop/data/datanode
mkdir -p /opt/hadoop/tmp

# Format the NameNode (run only once, on the master node, before the first start)
hdfs namenode -format
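
If you ever need to reformat, for example after a failed first start, note that formatting generates a new clusterID and DataNodes holding the old ID will refuse to register. The usual (destructive) recovery is to stop HDFS and clear the data directories on every node before formatting again:

# Destructive: wipes all HDFS data. Only for reformatting a broken cluster.
stop-dfs.sh
# On every node:
rm -rf /opt/hadoop/data/namenode/* /opt/hadoop/data/datanode/* /opt/hadoop/tmp/*
# Then on master:
hdfs namenode -format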

7.2 Start HDFS

# Start HDFS (run on the master node)
start-dfs.sh

# Check the Java processes
jps
# On master you should see NameNode and SecondaryNameNode; on each worker, DataNode

7.3 Start YARN

# Start YARN (run on the master node)
start-yarn.sh

# Check the Java processes
jps
# On master you should see ResourceManager; on each worker, NodeManager

7.4 Start the JobHistory Server

# Start the JobHistory Server
mapred --daemon start historyserver

# Check the Java processes
jps
# You should now also see JobHistoryServer

8. Verifying Cluster Status

8.1 HDFS Status Check

# View the HDFS cluster report
hdfs dfsadmin -report

# List the filesystem root
hdfs dfs -ls /

# Create a test directory and upload a file
hdfs dfs -mkdir /test
hdfs dfs -put /etc/passwd /test/
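
If the put command fails right after startup, the NameNode may still be in safe mode, which it leaves automatically once enough DataNodes have reported their blocks; you can check and, if it never clears, leave it manually:

# Safe mode status (HDFS is read-only while in safe mode)
hdfs dfsadmin -safemode get
hdfs dfsadmin -safemode leave   # force-leave only after investigating why it did not clear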

8.2 YARN Status Check

# List the YARN nodes
yarn node -list

# Show queue information
yarn queue -status default

8.3 Web UIs

NameNode Web UI:     http://master:9870
ResourceManager UI:  http://master:8088
JobHistory Server:   http://master:19888

9. Running Test Jobs

9.1 Run an Example Program

# Create the input directory
hdfs dfs -mkdir /input

# Upload a test file
hdfs dfs -put /etc/passwd /input/

# Run the WordCount example
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.4.jar \
    wordcount /input /output

# View the result
hdfs dfs -cat /output/part-r-00000
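
MapReduce refuses to write into an existing output directory, so remove /output before re-running the job:

# Required before re-running WordCount with the same output path
hdfs dfs -rm -r /output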

9.2 Performance Tests

# Generate test data with TeraGen
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.4.jar \
    teragen 1000000 /teragen-output

# Run the TeraSort benchmark
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.4.jar \
    terasort /teragen-output /terasort-output

# Validate the sorted output
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.4.jar \
    teravalidate /terasort-output /teravalidate-output
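
TeraGen writes fixed 100-byte rows, so the 1,000,000 rows above amount to roughly 100 MB, which finishes quickly but barely exercises the cluster. For a more meaningful run, scale the row count up, for example to about 1 GB (illustrative output path):

# ~1 GB of input: 10,000,000 rows x 100 bytes each
hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.4.jar \
    teragen 10000000 /teragen-1g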

10. Cluster Management Scripts

10.1 Start Script

#!/bin/bash
# start-cluster.sh

echo "Starting the Hadoop cluster..."

# Start HDFS
echo "Starting HDFS..."
start-dfs.sh

# Start YARN
echo "Starting YARN..."
start-yarn.sh

# Start the JobHistory Server
echo "Starting the JobHistory Server..."
mapred --daemon start historyserver

echo "Cluster startup complete!"
echo "NameNode Web UI: http://master:9870"
echo "ResourceManager UI: http://master:8088"
echo "JobHistory Server: http://master:19888"

10.2 Stop Script

#!/bin/bash
# stop-cluster.sh

echo "Stopping the Hadoop cluster..."

# Stop the JobHistory Server
echo "Stopping the JobHistory Server..."
mapred --daemon stop historyserver

# Stop YARN
echo "Stopping YARN..."
stop-yarn.sh

# Stop HDFS
echo "Stopping HDFS..."
stop-dfs.sh

echo "Cluster shutdown complete!"

10.3 Status Check Script

#!/bin/bash
# check-cluster.sh

echo "=== Hadoop Cluster Status Check ==="

echo -e "\n1. Java processes:"
jps

echo -e "\n2. HDFS status:"
hdfs dfsadmin -report | head -20

echo -e "\n3. YARN node status:"
yarn node -list

echo -e "\n4. Disk usage:"
df -h | grep -E "(Filesystem|/opt)"

echo -e "\n5. Memory usage:"
free -h
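
jps only reports the daemons on the node where it runs. To see every node from master in one pass, a small SSH loop like the following can be appended to the script (a sketch relying on the passwordless SSH from section 3; if jps is not on the non-interactive PATH, call it via $JAVA_HOME/bin/jps):

# Daemons on every node, collected from master
for h in master slave1 slave2 slave3; do
    echo -e "\n=== $h ==="
    ssh "$h" jps
done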

11. Troubleshooting

11.1 Common Problems

NameNode fails to start

# Check the log
tail -f $HADOOP_HOME/logs/hadoop-*-namenode-*.log

# Common causes:
# 1. The port is already in use
netstat -tlnp | grep 9000

# 2. Permission problems
chown -R hadoop:hadoop /opt/hadoop

# 3. Insufficient disk space
df -h

DataNode cannot connect to the NameNode

# Check network connectivity
telnet master 9000

# Check the firewall
sudo systemctl status firewalld

# Check the hosts file
cat /etc/hosts

11.2 Log Analysis

# Tail the log of each component
tail -f $HADOOP_HOME/logs/hadoop-*-namenode-*.log
tail -f $HADOOP_HOME/logs/hadoop-*-datanode-*.log
tail -f $HADOOP_HOME/logs/yarn-*-resourcemanager-*.log
tail -f $HADOOP_HOME/logs/yarn-*-nodemanager-*.log

11.3 Performance Monitoring

# Monitor system resources
top
iostat -x 1
sar -u 1

# Monitor Hadoop metrics
hdfs dfsadmin -report
yarn top

Summary

This chapter walked through building a Hadoop cluster: preparing the environment, installing the software, writing the configuration files, and starting and verifying the cluster. A correctly built environment is the foundation for the MapReduce development and execution covered in later chapters.

The next chapter covers developing and debugging MapReduce programs.