虚拟机规划
-- 无需配置/etc/hosts
10.37.129.3 node1
10.37.129.4 node2
10.37.129.5 node3
10.37.129.6 vip
-- 所有节点安装配置
一、系统环境准备
关闭防火墙
systemctl stop firewalld
systemctl disable firewalld
关闭selinux
sed -i 's/SELINUX=.*/SELINUX=disabled/g' /etc/selinux/config
# 临时关闭selinux
setenforce 0
安装配置NTP
yum install -y chrony
# 时间同步对于集群环境很重要
sed -i '/^server/d' /etc/chrony.conf
echo 'server s1a.time.edu.cn iburst' >> /etc/chrony.conf
启动NTP服务
systemctl start chronyd
systemctl enable chronyd
二、各类软件安装
安装PostgreSQL软件
# 仅需要安装软件,安装过程会自动创建postgres用户,但是不包含用户密码
yum install -y postgresql12 postgresql12-server postgresql12-libs postgresql12-contrib
# 修改postgres用户密码
echo '123456'|passwd postgres --stdin
安装python3以及依赖包
yum install -y python3 python-psycopg2 python3-devel
安装Etcd软件
yum install -y etcd
安装patroni软件
# patroni软件使用python3编写,因此需要使用pip安装
pip3 install psycopg2-binary
pip3 install patroni[etcd]
三、配置sudo权限
$ visudo
## Allow root to run any commands anywhere
postgres ALL=(root) NOPASSWD: ALL
四、Etcd集群配置
Etcd配置安装参考另外一篇博客《Etcd集群静态配置》,安装配置较为简单
# 启动Etcd
systemctl start etcd    # 命令执行后,直接去其他节点启动,节点间通信成功后才可以正常启动,不然会报超时错误
systemctl enable etcd   # 启用开机自启动
# 查看Etcd状态
$ etcdctl member list
49bab10dd5347fa2, started, etcd2,
97cdb5480a6be19f, started, etcd3,
a2916ae27f29d955, started, etcd1,
五、创建patroni服务
# 为了更便于管理,配置systemd服务进行管理,在patroni源码包中有patroni.service示例文件可以参考
# 注意:systemd单元文件不支持行尾注释,注释必须独占一行,否则会被当作命令参数
$ vi /usr/lib/systemd/system/patroni.service
[Unit]
Description=Runners to orchestrate a high-availability PostgreSQL
After=syslog.target network.target

[Service]
Type=simple
User=postgres
Group=postgres
EnvironmentFile=-/etc/patroni/patroni_env.conf
# 使用watchdog进行服务监控
ExecStartPre=-/usr/bin/sudo /sbin/modprobe softdog
# 使用postgres用户管理,需要sudo
ExecStartPre=-/usr/bin/sudo /bin/chown postgres /dev/watchdog
# 注意纠正patroni命令的路径
ExecStart=/usr/local/bin/patroni /etc/patroni/patroni.yml
ExecReload=/bin/kill -s HUP $MAINPID
KillMode=process
TimeoutSec=30
Restart=no

[Install]
WantedBy=multi-user.target

# 重新加载systemd服务
$ systemctl daemon-reload
六、配置patroni
# node1
# 配置文件路径在前面systemd服务中已经定义
# 文件格式是yaml文本,不支持使用tab制表位,会提示语法错误
$ vi /etc/patroni/patroni.yml
scope: pgsql
namespace: /service/    # Etcd中键值位置
name: pg1               # patroni名称,每个节点不一样

restapi:
  listen: 0.0.0.0:8008                # 保持默认,监听所有的8008端口
  connect_address: 10.37.129.3:8008   # 本地连接通信

etcd3:    # 这里建议使用etcdv3,默认是etcdv2,默认写入到etcd中的键值都是不可见的(仅patroni如此)
  hosts: 10.37.129.3:2379,10.37.129.4:2379,10.37.129.5:2379   # Etcd地址,如果这里使用单节点的etcd,需要将hosts关键字替换为host

bootstrap:
  dcs:
    ttl: 30
    loop_wait: 10
    retry_timeout: 10
    maximum_lag_on_failover: 1048576    # 如果延迟超过1048576,不允许failover
    master_start_timeout: 300
    synchronous_mode: false             # 异步复制
    postgresql:                         # 以下是pgsql服务的特性即参数配置,不详述
      use_pg_rewind: true
      use_slots: true
      parameters:
        listen_addresses: "0.0.0.0"
        port: 5432
        wal_level: replica
        hot_standby: "on"
        wal_keep_segments: 100
        max_wal_senders: 10
        max_replication_slots: 10
        wal_log_hints: "on"
        archive_mode: "on"
        archive_timeout: 1800s
        archive_command: mkdir -p ../wal_archive && test ! -f ../wal_archive/%f && cp %p ../wal_archive/%f
      recovery_conf:
        restore_command: cp ../wal_archive/%f %p
  initdb:
  - encoding: UTF8
  - locale: C
  - lc-ctype: zh_CN.UTF-8
  - data-checksums
  pg_hba:    # 定义流复制用户和远程连接身份鉴别设置
  - host replication postgres 10.37.129.0/24 md5
  - host all all 0.0.0.0/0 md5

postgresql:
  listen: 0.0.0.0:5432
  connect_address: 10.37.129.3:5432   # 连接pgsql服务的配置,这里不能使用127.0.0.1,pg_basebackup需要远程连接主库进行在线复制
  data_dir: /var/lib/pgsql/12/data    # $PGDATA
  bin_dir: /usr/pgsql-12/bin          # $PGHOME/bin
  authentication:
    replication:
      username: postgres
      password: "123456"
    superuser:
      username: postgres
      password: "123456"
    rewind:
      username: postgres
      password: "123456"
  basebackup:
    max-rate: 100M
    checkpoint: fast
  callbacks:    # 本次配置没有使用haproxy+keepalived实现VIP切换和负载均衡,因为callbacks方式更快速,对系统资源消耗更小,操作更简单,脚本后面提供
    on_start: /bin/bash /etc/patroni/patroni_callback.sh         # patroni服务启动时候的触发的操作
    on_stop: /bin/bash /etc/patroni/patroni_callback.sh          # patroni服务停止时候触发的操作
    on_role_change: /bin/bash /etc/patroni/patroni_callback.sh   # patroni服务角色切换时触发的操作

watchdog:    # 使用linux自带的软件watchdog监控patroni的服务持续性
  mode: automatic          # Allowed values: off, automatic, required
  device: /dev/watchdog    # watchdog设备,/dev/watchdog和/dev/watchdog0等同,可能存在兼容性区别
  safety_margin: 5

tags:
  nofailover: false      # 是否执行自动切换
  noloadbalance: false   # 是否开启负载均衡
  clonefrom: false
  nosync: false
# node2
# 修改
name: pg2
restapi:
  listen: 0.0.0.0:8008
  connect_address: 10.37.129.4:8008
postgresql:
  listen: 0.0.0.0:5432
  connect_address: 10.37.129.4:5432
# node3
# 修改
name: pg3
restapi:
  listen: 0.0.0.0:8008
  connect_address: 10.37.129.5:8008
postgresql:
  listen: 0.0.0.0:5432
  connect_address: 10.37.129.5:5432
七、安装watchdog
# 安装软件,linux内置功能
yum install -y watchdog
# 初始化watchdog字符设备
modprobe softdog
# 修改/dev/watchdog设备权限
chmod 666 /dev/watchdog
# 启动watchdog服务
systemctl start watchdog
systemctl enable watchdog
八、创建patroni_callback脚本
# 脚本的开头传入了三个变量,但是在patroni.yml文件中我们并没有传入任何的变量,实际测试过程中发现由patroni服务默认传入三个变量
#   $1 - action, patroni触发的动作,on_stop/on_start/on_role_change/on_restart/on_reload
#   $2 - role, 当前节点的角色,master/{slave|replica}
#   $3 - scope, 集群名称(即patroni.yml中的scope),本例为pgsql
# 脚本来自于其他博客,由于逻辑很简单,直接引用了
# 当节点角色为主,使用ip addr命令绑定VIP地址
# 当节点角色为备,使用ip addr命令解绑VIP地址
$ vi /etc/patroni/patroni_callback.sh
#!/bin/bash

readonly action=$1
readonly role=$2
readonly scope=$3

function usage() {
    echo "Usage: $0"
    exit 1
}

echo "this is patroni callback $action $role $scope"

# 注意:这里必须判断$action;原脚本误用了未定义的$cb_name,会导致所有回调都走到usage分支并退出
case $action in
    on_stop)
        sudo ip addr del 10.37.129.6/24 dev eth1 label eth1:1
        ;;
    on_start)
        ;;
    on_role_change)
        if [[ $role == 'master' ]]; then
            # 绑定VIP
            sudo ip addr add 10.37.129.6/24 brd 10.37.129.255 dev eth1 label eth1:1
            # 发送免费ARP(gratuitous ARP),通知同网段主机更新VIP对应的ARP缓存
            sudo arping -q -A -c 1 -I eth1 10.37.129.6
        else
            sudo ip addr del 10.37.129.6/24 dev eth1 label eth1:1
        fi
        ;;
    *)
        usage
        ;;
esac
九、启动patroni服务
# 启动服务
systemctl start patroni    # 服务启动会自动进行数据库的初始化和备库的创建
systemctl enable patroni
# 查看状态
$ patronictl -c /etc/patroni/patroni.yml list
+ Cluster: pgsql (6942702405679816489) ----+----+-----------+
| Member | Host        | Role    | State   | TL | Lag in MB |
+--------+-------------+---------+---------+----+-----------+
| pg1    | 10.37.129.3 | Replica | running |  2 |         0 |
| pg2    | 10.37.129.4 | Leader  | running |  2 |           |
| pg3    | 10.37.129.5 | Replica | running |  2 |         0 |
+--------+-------------+---------+---------+----+-----------+
# 查看VIP
[root@node2 ~]# ip -o -4 a
1: lo inet 127.0.0.1/8 scope host lo\ valid_lft forever preferred_lft forever
2: eth0 inet 10.211.55.11/24 brd 10.211.55.255 scope global noprefixroute dynamic eth0\ valid_lft 1766sec preferred_lft 1766sec
3: eth1 inet 10.37.129.4/24 brd 10.37.129.255 scope global noprefixroute eth1\ valid_lft forever preferred_lft forever
3: eth1 inet 10.37.129.6/24 brd 10.37.129.255 scope global secondary eth1:1\ valid_lft forever preferred_lft forever
4: eth2 inet 192.168.56.4/24 brd 192.168.56.255 scope global noprefixroute eth2\ valid_lft forever preferred_lft forever
十、手动触发故障切换
# PostgreSQL默认使用异步复制,当postgres服务异常停止后并不会立即触发主备自动切换,而是由patroni先尝试拉起down掉的服务,如果未能正常拉起服务,才执行故障切换
# 关闭patroni服务
$ systemctl stop patroni
# 查看patroni服务
[root@node3 ~]# patronictl -c /etc/patroni/patroni.yml list
+ Cluster: pgsql (6942702405679816489) ----+----+-----------+
| Member | Host        | Role    | State   | TL | Lag in MB |
+--------+-------------+---------+---------+----+-----------+
| pg1    | 10.37.129.3 | Replica | running |  4 |         0 |
| pg2    | 10.37.129.4 | Replica | stopped |    |   unknown |
| pg3    | 10.37.129.5 | Leader  | running |  4 |           |
+--------+-------------+---------+---------+----+-----------+
# 查看VIP
[root@node3 ~]# ip -o -4 a
1: lo inet 127.0.0.1/8 scope host lo\ valid_lft forever preferred_lft forever
2: eth0 inet 10.211.55.12/24 brd 10.211.55.255 scope global noprefixroute dynamic eth0\ valid_lft 1224sec preferred_lft 1224sec
3: eth1 inet 10.37.129.5/24 brd 10.37.129.255 scope global noprefixroute eth1\ valid_lft forever preferred_lft forever
3: eth1 inet 10.37.129.6/24 brd 10.37.129.255 scope global secondary eth1:1\ valid_lft forever preferred_lft forever
4: eth2 inet 192.168.56.5/24 brd 192.168.56.255 scope global noprefixroute eth2\ valid_lft forever preferred_lft forever
# 管理集群
$ patronictl --help