7.3 高可用性とスケーリング
エンタープライズ環境での継続的な監視サービスを実現するZabbix高可用性とスケーリング戦略
概要
高可用性とスケーリングは、Zabbixを企業の重要なインフラストラクチャとして運用するために不可欠な要件です。適切に設計された高可用性アーキテクチャにより、単一障害点の除去、性能向上、そして将来の成長に対応できる拡張性を実現できます。
高可用性の価値
| 要素 | 効果 | 適用場面 |
|---|---|---|
| 冗長性 | 単一障害点の除去 | ミッションクリティカル環境 |
| 負荷分散 | 性能向上・安定性 | 大規模監視環境 |
| 自動フェイルオーバー | ダウンタイム最小化 | 24/7運用環境 |
| 水平スケーリング | 容量拡張 | 成長する監視要件 |
| 災害復旧 | 事業継続性 | 地理的分散環境 |
Zabbixクラスタリングアーキテクチャ
HA Nodeを使用したクラスタ構成
基本的なHA Node設定
Zabbix 6.0以降で導入されたHA Node機能により、アクティブ-パッシブ型のクラスタ構成が可能です。すべてのノードは同一のデータベースを共有し、同時にアクティブになるのは1ノードのみです。アクティブノードが停止すると、スタンバイノードのいずれかが自動的に処理を引き継ぎます。
bash
# Zabbix Server HA configuration
# /etc/zabbix/zabbix_server.conf
# HA node settings: the node's cluster name and the address:port it is reached at
HANodeName=zabbix-node1
NodeAddress=192.168.1.10:10051
# Database settings
DBHost=192.168.1.20
DBName=zabbix
DBUser=zabbix
DBPassword=secure_password
# Process counts and cache sizes (the original heading called these "HA-specific";
# NOTE(review): they look like general sizing parameters — confirm)
StartPollers=100
StartPollersUnreachable=50
StartTrappers=50
StartDBSyncers=16
CacheSize=2G
HistoryCacheSize=1G
TrendCacheSize=512M
ValueCacheSize=4G
複数ノード構成例
yaml
# docker-compose.yml for Zabbix HA Cluster
#
# Two Zabbix server containers (HA nodes) share one MySQL database.
# Passwords below are placeholders; replace them before use.
version: '3.8'

services:
  zabbix-server-node1:
    # Pinned release line: both HA nodes must run the same Zabbix version,
    # which an unpinned ":latest" pulled at different times does not guarantee.
    image: zabbix/zabbix-server-mysql:6.4-alpine-latest
    container_name: zabbix-server-node1
    environment:
      - DB_SERVER_HOST=mysql-primary
      - MYSQL_DATABASE=zabbix
      - MYSQL_USER=zabbix
      - MYSQL_PASSWORD=zabbix_password
      - ZBX_HANODENAME=zabbix-node1
      - ZBX_NODEADDRESS=zabbix-server-node1:10051
      - ZBX_STARTPOLLERS=100
      - ZBX_STARTTRAPPERS=50
      - ZBX_CACHESIZE=2G
      - ZBX_HISTORYCACHESIZE=1G
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - zabbix-server-node1-data:/var/lib/zabbix
    networks:
      - zabbix-cluster
    depends_on:
      - mysql-primary
    restart: unless-stopped

  zabbix-server-node2:
    image: zabbix/zabbix-server-mysql:6.4-alpine-latest
    container_name: zabbix-server-node2
    environment:
      - DB_SERVER_HOST=mysql-primary
      - MYSQL_DATABASE=zabbix
      - MYSQL_USER=zabbix
      - MYSQL_PASSWORD=zabbix_password
      - ZBX_HANODENAME=zabbix-node2
      - ZBX_NODEADDRESS=zabbix-server-node2:10051
      - ZBX_STARTPOLLERS=100
      - ZBX_STARTTRAPPERS=50
      - ZBX_CACHESIZE=2G
      - ZBX_HISTORYCACHESIZE=1G
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - zabbix-server-node2-data:/var/lib/zabbix
    networks:
      - zabbix-cluster
    depends_on:
      - mysql-primary
    restart: unless-stopped

  mysql-primary:
    image: mysql:8.0
    container_name: mysql-primary
    environment:
      - MYSQL_DATABASE=zabbix
      - MYSQL_USER=zabbix
      - MYSQL_PASSWORD=zabbix_password
      - MYSQL_ROOT_PASSWORD=root_password
    command:
      - mysqld
      - --character-set-server=utf8mb4
      - --collation-server=utf8mb4_bin
      - --default-authentication-plugin=mysql_native_password
      - --log-bin=mysql-bin
      # With binary logging on, the non-SUPER "zabbix" user cannot create the
      # schema's triggers unless this is enabled.
      - --log-bin-trust-function-creators=1
      - --server-id=1
      - --gtid-mode=ON
      - --enforce-gtid-consistency=ON
    volumes:
      - mysql-primary-data:/var/lib/mysql
      - ./mysql-conf:/etc/mysql/conf.d
    networks:
      - zabbix-cluster
    restart: unless-stopped

networks:
  zabbix-cluster:
    driver: bridge

volumes:
  zabbix-server-node1-data:
  zabbix-server-node2-data:
  mysql-primary-data:
Keepalived を使用したフェイルオーバー
Keepalived設定例
bash
# /etc/keepalived/keepalived.conf (Node1)
#
# VRRP instance that owns the virtual IP 192.168.1.100 while the local
# Zabbix health check succeeds.
vrrp_script chk_zabbix {
    script "/usr/local/bin/check_zabbix.sh"
    interval 2
    # On failure the effective priority drops by this amount. It must exceed
    # the priority gap to the peer, otherwise a failed check never triggers a
    # failover (the previous -2 left this node at 108).
    # NOTE(review): assumes the backup node uses priority 100 — confirm.
    weight -20
    fall 3
    rise 2
}

vrrp_instance VI_1 {
    state MASTER
    interface eth0
    virtual_router_id 51
    priority 110
    advert_int 1
    authentication {
        auth_type PASS
        # keepalived only uses the first 8 characters of auth_pass; the
        # previous value "zabbix_vrrp_pass" was effectively this string.
        auth_pass zabbix_v
    }
    virtual_ipaddress {
        192.168.1.100/24
    }
    track_script {
        chk_zabbix
    }
    notify_master "/usr/local/bin/zabbix_master.sh"
    notify_backup "/usr/local/bin/zabbix_backup.sh"
    notify_fault "/usr/local/bin/zabbix_fault.sh"
}
Zabbixサービスチェックスクリプト
bash
#!/bin/bash
# /usr/local/bin/check_zabbix.sh
#
# Health check for the local Zabbix server, intended for keepalived's
# vrrp_script. Exits 0 only if the server process is running, its trapper
# port is listening, the database answers and the frontend API responds.
# Every external call is time-bounded so a hung dependency cannot stall the
# check beyond keepalived's interval.
#
# Environment (all optional):
#   ZABBIX_SERVER_PORT  trapper port to look for            (default 10051)
#   ZABBIX_SERVER_HOST  host serving the web frontend/API   (default localhost)
#   DB_DEFAULTS_FILE    MySQL option file with a [client] section holding
#                       user/password (default /etc/zabbix/check_zabbix.my.cnf)
#   DB_NAME             database to query                   (default zabbix)
set -u

readonly ZABBIX_SERVER_PORT="${ZABBIX_SERVER_PORT:-10051}"
readonly ZABBIX_SERVER_HOST="${ZABBIX_SERVER_HOST:-localhost}"
readonly DB_DEFAULTS_FILE="${DB_DEFAULTS_FILE:-/etc/zabbix/check_zabbix.my.cnf}"
readonly DB_NAME="${DB_NAME:-zabbix}"
readonly CHECK_TIMEOUT=2

# Print the reason and report failure to keepalived.
fail() {
  printf '%s\n' "$1"
  exit 1
}

# Process check: match the exact process name, not any command line that
# merely mentions "zabbix_server" (e.g. an editor on zabbix_server.conf).
pgrep -x zabbix_server >/dev/null || fail "Zabbix server process not running"

# Port check: ask ss for listeners on exactly this port instead of grepping
# for a ":PORT" substring, which would also match e.g. :100510.
ss -ltn "sport = :${ZABBIX_SERVER_PORT}" 2>/dev/null | grep -q LISTEN \
  || fail "Zabbix server port not listening"

# Database check: credentials come from an option file so the password never
# appears in the process list. --defaults-extra-file must be the first option.
mysql --defaults-extra-file="${DB_DEFAULTS_FILE}" \
  --connect-timeout="${CHECK_TIMEOUT}" \
  -e 'SELECT 1' "${DB_NAME}" >/dev/null 2>&1 \
  || fail "Database connection failed"

# API check: apiinfo.version needs no authentication; a healthy frontend
# answers with a JSON-RPC object containing a "result" member.
api_response=$(curl -s --max-time "${CHECK_TIMEOUT}" -X POST \
  -H "Content-Type: application/json" \
  -d '{"jsonrpc":"2.0","method":"apiinfo.version","params":{},"id":1}' \
  "http://${ZABBIX_SERVER_HOST}/zabbix/api_jsonrpc.php") || api_response=""
[[ "${api_response}" == *'"result"'* ]] || fail "API not responding"

echo "All checks passed"
exit 0
フェイルオーバースクリプト
bash
#!/bin/bash
# /usr/local/bin/zabbix_master.sh
#
# keepalived notify_master hook: runs when this node becomes MASTER.
# Starts the Zabbix and web services and makes sure the firewall accepts
# traffic on the Zabbix trapper and HTTP(S) ports. Safe to run repeatedly.
LOG_FILE="/var/log/zabbix_ha.log"

# Append a timestamped message to the HA log.
log() {
  echo "$(date): $*" >> "$LOG_FILE"
}

# Start a systemd unit and record a failure instead of ignoring it.
start_service() {
  local unit=$1
  systemctl start "$unit" || log "Failed to start ${unit}"
}

# Insert an ACCEPT rule for a TCP port only if it is not already present;
# a plain "iptables -I" would add a duplicate rule on every failover.
ensure_accept_rule() {
  local port=$1
  iptables -C INPUT -p tcp --dport "$port" -j ACCEPT 2>/dev/null \
    || iptables -I INPUT -p tcp --dport "$port" -j ACCEPT
}

log "Becoming MASTER"

# Zabbix server and agent
start_service zabbix-server
start_service zabbix-agent

# Web server
start_service httpd

# Firewall rules
ensure_accept_rule 10051
ensure_accept_rule 80
ensure_accept_rule 443

# DNS update (enable if needed)
# nsupdate -k /etc/nsupdate.key << EOF
# server dns-server.example.com
# update delete zabbix.example.com A
# update add zabbix.example.com 300 A 192.168.1.100
# send
# EOF

log "MASTER setup completed"
データベース高可用性
MySQL Master-Slave レプリケーション
Master設定
bash
# /etc/mysql/mysql.conf.d/mysqld.cnf (Master)
#
# Replication source settings for the Zabbix database, written for MySQL 8.0
# (GTID-based replication).
[mysqld]
# Server ID
server-id = 1
# Binary log
log-bin = mysql-bin
binlog-format = ROW
# Keep binary logs for 7 days. Replaces expire_logs_days, which is
# deprecated in MySQL 8.0.
binlog_expire_logs_seconds = 604800
# GTID
gtid-mode = ON
enforce-gtid-consistency = ON
# Replication
sync_binlog = 1
binlog-do-db = zabbix
# InnoDB performance
innodb_buffer_pool_size = 4G
innodb_log_file_size = 512M
innodb_flush_log_at_trx_commit = 1
innodb_flush_method = O_DIRECT
# Connection and buffer tuning for Zabbix
max_connections = 500
key_buffer_size = 256M
table_open_cache = 4000
sort_buffer_size = 2M
read_buffer_size = 1M
myisam_sort_buffer_size = 64M
thread_cache_size = 8
# query_cache_size and thread_concurrency were removed from this file: both
# options no longer exist in MySQL 8.0, and the server refuses to start when
# an unknown variable is present.
Slave設定とセットアップ
bash
# /etc/mysql/mysql.conf.d/mysqld.cnf (Slave)
# Replica-side settings for replicating the zabbix database.
[mysqld]
# Server ID (must differ from the master's value)
server-id = 2
# Read-only mode
read-only = 1
# Relay log
relay-log = mysql-relay-bin
relay-log-index = mysql-relay-bin.index
# GTID
gtid-mode = ON
enforce-gtid-consistency = ON
# Replication: log replicated changes and apply only the zabbix database
log-slave-updates = 1
replicate-do-db = zabbix
# InnoDB performance (same values as the master)
innodb_buffer_pool_size = 4G
innodb_log_file_size = 512M
レプリケーション設定スクリプト
bash
#!/bin/bash
# MySQL Master-Slave Setup Script
#
# Creates the replication account on the master, copies the zabbix database
# to the slave and starts GTID-based replication.
#
# The script stops at the first failing step. Passwords are handed to the
# MySQL clients through a private option file rather than on the command
# line, where they would be visible in the process list. The dump is written
# to a private temporary directory that is removed on exit.
set -euo pipefail

MASTER_HOST="192.168.1.20"
SLAVE_HOST="192.168.1.21"
REPL_USER="replicator"
REPL_PASS="replication_password"
MYSQL_ROOT_PASS="root_password"

workdir=$(mktemp -d)
cleanup() { rm -rf -- "${workdir:?}"; }
trap cleanup EXIT

client_cnf="${workdir}/client.cnf"
dump_file="${workdir}/zabbix_master_dump.sql"

# The [client] section is read by both mysql and mysqldump.
(
  umask 077
  printf '[client]\nuser=root\npassword="%s"\n' "$MYSQL_ROOT_PASS" > "$client_cnf"
)

# Create the replication user on the master.
# WARNING: RESET MASTER deletes every existing binary log on the master.
# It is only appropriate for a fresh setup with no other replicas attached.
mysql --defaults-extra-file="$client_cnf" -h "$MASTER_HOST" << EOF
CREATE USER IF NOT EXISTS '$REPL_USER'@'%' IDENTIFIED BY '$REPL_PASS';
GRANT REPLICATION SLAVE ON *.* TO '$REPL_USER'@'%';
FLUSH PRIVILEGES;
RESET MASTER;
EOF

# Dump the zabbix database from the master.
mysqldump --defaults-extra-file="$client_cnf" -h "$MASTER_HOST" \
  --single-transaction \
  --routines \
  --triggers \
  --databases zabbix > "$dump_file"

# Restore the dump on the slave.
mysql --defaults-extra-file="$client_cnf" -h "$SLAVE_HOST" < "$dump_file"

# Point the slave at the master and start replication.
mysql --defaults-extra-file="$client_cnf" -h "$SLAVE_HOST" << EOF
CHANGE MASTER TO
MASTER_HOST='$MASTER_HOST',
MASTER_USER='$REPL_USER',
MASTER_PASSWORD='$REPL_PASS',
MASTER_AUTO_POSITION=1;
START SLAVE;
SHOW SLAVE STATUS\G
EOF
MySQL Cluster (NDB) 構成
管理ノード設定
bash
# /etc/mysql/ndb_mgmd.cnf
# Cluster layout: one management node, two data nodes, two SQL nodes.
# Defaults applied to every data node section
[ndbd default]
NoOfReplicas=2
DataMemory=2G
IndexMemory=512M
# Management node
[ndb_mgmd]
hostname=192.168.1.30
datadir=/var/lib/mysql-cluster
# Data node 1
[ndbd]
hostname=192.168.1.31
datadir=/var/lib/mysql-cluster
# Data node 2
[ndbd]
hostname=192.168.1.32
datadir=/var/lib/mysql-cluster
# SQL nodes
[mysqld]
hostname=192.168.1.33
[mysqld]
hostname=192.168.1.34
データノード設定
bash
# /etc/mysql/my.cnf (Data Node)
# Points this data node at the management node (192.168.1.30).
[mysql_cluster]
ndb-connectstring=192.168.1.30
# Settings for this data node
[ndbd]
hostname=192.168.1.31
datadir=/var/lib/mysql-cluster
フロントエンド負荷分散
HAProxy設定
bash
# /etc/haproxy/haproxy.cfg
#
# HTTP(S) load balancer in front of three Zabbix web frontends with
# cookie-based session stickiness and active health checks.
global
    daemon
    maxconn 4096
    log stdout local0

defaults
    mode http
    # Without "log global" the proxies have no log target and
    # "option httplog" produces nothing.
    log global
    timeout connect 5000ms
    timeout client 50000ms
    timeout server 50000ms
    option httplog

frontend zabbix_frontend
    bind *:80
    bind *:443 ssl crt /etc/ssl/certs/zabbix.pem
    # Record the session cookie in the logs. This only captures the value;
    # stickiness is provided by the SERVERID cookie in the backend.
    # NOTE(review): recent Zabbix frontends name the cookie "zbx_session" —
    # confirm against the deployed version.
    capture cookie ZBXSESSID len 32
    # Security headers
    http-response set-header X-Frame-Options DENY
    http-response set-header X-Content-Type-Options nosniff
    http-response set-header X-XSS-Protection "1; mode=block"
    # Forward to the backend
    default_backend zabbix_servers

backend zabbix_servers
    balance roundrobin
    # Session stickiness
    cookie SERVERID insert indirect nocache
    # Health check
    option httpchk GET /zabbix/ping.php
    http-check expect status 200
    # Zabbix web servers
    server zabbix-web1 192.168.1.41:80 check cookie web1
    server zabbix-web2 192.168.1.42:80 check cookie web2
    server zabbix-web3 192.168.1.43:80 check cookie web3

# Statistics page
listen stats
    bind *:8080
    stats enable
    stats uri /stats
    stats refresh 30s
    # Admin actions (disabling or draining servers) only from the load
    # balancer itself. "if TRUE" granted them to anyone who can reach :8080.
    stats admin if LOCALHOST
Nginx負荷分散設定
nginx
# /etc/nginx/nginx.conf
#
# Reverse proxy / load balancer for the Zabbix web frontends.
# Both blocks below belong inside the http { } context.
upstream zabbix_backend {
    # Session affinity: a client IP is always sent to the same server.
    ip_hash;
    server 192.168.1.41:80 weight=3 max_fails=3 fail_timeout=30s;
    server 192.168.1.42:80 weight=3 max_fails=3 fail_timeout=30s;
    # The "backup" parameter cannot be combined with ip_hash (nginx rejects
    # the configuration), so the third server is a regular lower-weight member.
    server 192.168.1.43:80 weight=2 max_fails=3 fail_timeout=30s;
}

server {
    listen 80;
    listen 443 ssl http2;
    server_name zabbix.example.com;

    # TLS
    ssl_certificate /etc/ssl/certs/zabbix.crt;
    ssl_certificate_key /etc/ssl/private/zabbix.key;
    ssl_protocols TLSv1.2 TLSv1.3;
    # AES256-GCM suites are paired with SHA384. The former "...-SHA512"
    # names match no cipher and leave TLSv1.2 without a usable suite.
    ssl_ciphers ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES256-GCM-SHA384;

    # Security headers
    add_header X-Frame-Options DENY;
    add_header X-Content-Type-Options nosniff;
    add_header X-XSS-Protection "1; mode=block";
    add_header Strict-Transport-Security "max-age=31536000; includeSubDomains";

    # Compression
    gzip on;
    gzip_types text/css application/javascript image/svg+xml;

    # Proxy settings shared by every proxied location below
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
    # WebSocket support
    proxy_http_version 1.1;
    proxy_set_header Upgrade $http_upgrade;
    proxy_set_header Connection "upgrade";
    # Timeouts
    proxy_connect_timeout 30s;
    proxy_send_timeout 30s;
    proxy_read_timeout 30s;

    # Static assets: fetched from the upstream and cached by clients.
    # Without proxy_pass nginx would look for them on the local disk.
    location ~* \.(css|js|png|jpg|jpeg|gif|ico|svg)$ {
        proxy_pass http://zabbix_backend;
        expires 1y;
        # add_header here replaces the server-level set for this location,
        # so the security headers are repeated.
        add_header Cache-Control "public, immutable";
        add_header X-Frame-Options DENY;
        add_header X-Content-Type-Options nosniff;
        add_header X-XSS-Protection "1; mode=block";
        add_header Strict-Transport-Security "max-age=31536000; includeSubDomains";
    }

    # Everything else goes to the upstream
    location / {
        proxy_pass http://zabbix_backend;
    }

    # Health-check endpoint answered by nginx itself
    location /health {
        access_log off;
        # default_type sets the response type; an add_header Content-Type
        # would send a second Content-Type header.
        default_type text/plain;
        return 200 "healthy\n";
    }
}
パフォーマンス最適化
Zabbixサーバー最適化
プロセス数の最適化
bash
# /etc/zabbix/zabbix_server.conf
#
# Sizing presets. Enable exactly ONE tier: when a parameter is given more
# than once only one value takes effect, so leaving all three tiers active
# hides which sizing is really in use. The large tier is enabled here, which
# matches what the file produced when all three were listed.

# Baseline (small: up to ~1,000 hosts)
# StartPollers=50
# StartPollersUnreachable=10
# StartTrappers=20
# StartDBSyncers=4

# Medium (1,000 to 10,000 hosts)
# StartPollers=100
# StartPollersUnreachable=20
# StartTrappers=50
# StartDBSyncers=8

# Large (more than 10,000 hosts)
StartPollers=200
StartPollersUnreachable=50
StartTrappers=100
StartDBSyncers=16
StartTimers=2
StartEscalators=2

# Memory
CacheSize=4G
HistoryCacheSize=2G
TrendCacheSize=1G
ValueCacheSize=8G

# Database connection
DBHost=localhost
DBName=zabbix
DBUser=zabbix
DBSocket=/var/run/mysqld/mysqld.sock

# Housekeeping
HousekeepingFrequency=1
MaxHousekeeperDelete=50000
動的設定調整スクリプト
python
#!/usr/bin/env python3
# Zabbix Server Performance Tuner
import psutil
import subprocess
import configparser
import math
class ZabbixPerformanceTuner:
    """Derive Zabbix server sizing parameters from the local machine's
    resources and write them into the server configuration file.

    Attributes:
        config_file: path of the zabbix_server.conf to update.
        system_info: dict with 'cpu_cores', 'memory_gb', 'disk_io' and
            'network_io' as collected by psutil at construction time.
    """

    def __init__(self, config_file="/etc/zabbix/zabbix_server.conf"):
        self.config_file = config_file
        self.system_info = self._get_system_info()

    def _get_system_info(self):
        """Collect CPU, memory, disk and network figures via psutil."""
        return {
            'cpu_cores': psutil.cpu_count(),
            'memory_gb': psutil.virtual_memory().total / (1024**3),
            'disk_io': psutil.disk_io_counters(),
            'network_io': psutil.net_io_counters()
        }

    @staticmethod
    def _cache_size_mb(memory_gb, fraction, cap_gb, floor_mb):
        """Return a cache size in MB: `fraction` of RAM rounded down to whole
        GB, capped at `cap_gb`, and never below `floor_mb`.

        The floor prevents emitting "0M" on small machines, a value the
        server does not accept for its caches.
        """
        whole_gb = min(int(memory_gb * fraction), cap_gb)
        return max(whole_gb * 1024, floor_mb)

    def calculate_optimal_settings(self, host_count, item_count):
        """Compute recommended settings.

        Args:
            host_count: number of monitored hosts.
            item_count: number of monitored items.

        Returns:
            dict mapping zabbix_server.conf parameter names to values
            (process counts as int, cache sizes as strings such as "2048M").
        """
        # psutil.cpu_count() may return None when the count is undetermined.
        cpu_cores = self.system_info['cpu_cores'] or 1
        memory_gb = self.system_info['memory_gb']

        # Pollers: one per 20 hosts, clamped to 10..200 and to 10 per core.
        base_pollers = min(200, max(10, host_count // 20))
        pollers = min(base_pollers, cpu_cores * 10)

        # Caches (MB). Floors are the server's default sizes. History and
        # trend caches are capped at 2G, the upper limit in Zabbix 6.x.
        cache_size = self._cache_size_mb(memory_gb, 0.3, 8, 32)
        history_cache = self._cache_size_mb(memory_gb, 0.15, 2, 16)
        trend_cache = self._cache_size_mb(memory_gb, 0.1, 2, 4)
        value_cache = self._cache_size_mb(memory_gb, 0.2, 16, 8)

        # Item-count based settings
        trappers = min(100, max(10, item_count // 1000))
        db_syncers = min(16, max(4, cpu_cores // 2))

        return {
            'StartPollers': pollers,
            'StartPollersUnreachable': max(5, pollers // 5),
            'StartTrappers': trappers,
            'StartDBSyncers': db_syncers,
            'CacheSize': f"{cache_size}M",
            'HistoryCacheSize': f"{history_cache}M",
            'TrendCacheSize': f"{trend_cache}M",
            'ValueCacheSize': f"{value_cache}M"
        }

    def apply_settings(self, settings):
        """Write `settings` into the configuration file.

        A copy of the current file is saved as "<config_file>.backup" first.
        The file is edited line by line: zabbix_server.conf is a flat list of
        "Key=Value" lines with no section headers, which configparser can
        neither read nor write. Every active line of a key in `settings` is
        replaced; keys not present are appended. Comments and unrelated
        parameters are left untouched.

        Raises:
            OSError: if the file cannot be read, backed up or written.
        """
        import re
        import shutil

        shutil.copy2(self.config_file, f"{self.config_file}.backup")

        with open(self.config_file, 'r') as f:
            original_lines = f.read().splitlines()

        pending = {key: str(value) for key, value in settings.items()}
        active_key = re.compile(r'^\s*([A-Za-z0-9_.]+)\s*=')
        written = set()
        new_lines = []
        for line in original_lines:
            match = active_key.match(line)
            key = match.group(1) if match else None
            if key in pending:
                new_lines.append(f"{key}={pending[key]}")
                written.add(key)
            else:
                new_lines.append(line)
        for key, value in pending.items():
            if key not in written:
                new_lines.append(f"{key}={value}")

        with open(self.config_file, 'w') as f:
            f.write("\n".join(new_lines) + "\n")

        print(f"Settings applied to {self.config_file}")
        print("Please restart Zabbix server to apply changes")

    def monitor_performance(self):
        """Return a snapshot of Zabbix process count and system load.

        Returns:
            dict with 'zabbix_processes', 'cpu_usage', 'memory_usage',
            'memory_available' (GB), 'disk_read_mb' and 'disk_write_mb'.
        """
        # Zabbix process statistics. psutil reports None for attributes it
        # is not allowed to read, so the name is normalised first.
        zabbix_processes = [
            proc.info
            for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_info'])
            if 'zabbix' in (proc.info['name'] or '')
        ]

        # System metrics
        cpu_usage = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory()
        # disk_io_counters() returns None on systems without disks.
        disk_io = psutil.disk_io_counters()
        read_bytes = disk_io.read_bytes if disk_io is not None else 0
        write_bytes = disk_io.write_bytes if disk_io is not None else 0

        return {
            'zabbix_processes': len(zabbix_processes),
            'cpu_usage': cpu_usage,
            'memory_usage': memory.percent,
            'memory_available': memory.available / (1024**3),
            'disk_read_mb': read_bytes / (1024**2),
            'disk_write_mb': write_bytes / (1024**2)
        }
# Usage example. Guarded so that importing this module does not rewrite
# the Zabbix server configuration as a side effect.
if __name__ == "__main__":
    tuner = ZabbixPerformanceTuner()
    optimal_settings = tuner.calculate_optimal_settings(host_count=5000, item_count=50000)
    tuner.apply_settings(optimal_settings)
データベース最適化
MySQL/MariaDB 最適化設定
bash
# /etc/mysql/conf.d/zabbix.cnf
#
# Database tuning for Zabbix, written for MySQL 8.0. The server refuses to
# start when it meets a variable it does not know, so options that exist
# only in other versions or in MariaDB are not set here (see notes below).
[mysqld]
# Basics
character-set-server = utf8mb4
collation-server = utf8mb4_bin
# MySQL only; remove this line when running MariaDB.
default-authentication-plugin = mysql_native_password
# Memory (sized for a host with 16 GB RAM)
innodb_buffer_pool_size = 12G
innodb_buffer_pool_instances = 8
innodb_log_buffer_size = 64M
key_buffer_size = 512M
tmp_table_size = 512M
max_heap_table_size = 512M
# Redo log
innodb_log_file_size = 1G
innodb_log_files_in_group = 2
innodb_flush_log_at_trx_commit = 2
innodb_flush_method = O_DIRECT
# Connections
max_connections = 1000
max_connect_errors = 100000
connect_timeout = 60
wait_timeout = 28800
interactive_timeout = 28800
# Query cache: removed in MySQL 8.0, so query_cache_type, query_cache_size
# and query_cache_limit are no longer set.
# Threads
thread_cache_size = 128
thread_stack = 256K
# Table caches
table_open_cache = 8192
table_definition_cache = 4096
# Zabbix-specific tuning
# One tablespace file per table
innodb_file_per_table = 1
# Parallel I/O threads
innodb_read_io_threads = 8
innodb_write_io_threads = 8
# Compression
innodb_compression_level = 6
# MariaDB only (unknown to MySQL); enable when running MariaDB:
# innodb_compression_algorithm = zlib
PostgreSQL最適化設定
bash
# /etc/postgresql/14/main/postgresql.conf
# PostgreSQL tuning values for a Zabbix database.
# Memory
shared_buffers = 4GB
effective_cache_size = 12GB
# NOTE(review): work_mem applies per sort/hash operation, so 256MB combined
# with max_connections = 500 below may exceed available RAM — confirm sizing.
work_mem = 256MB
maintenance_work_mem = 1GB
# WAL
wal_buffers = 64MB
checkpoint_completion_target = 0.9
max_wal_size = 4GB
min_wal_size = 1GB
# Connections
max_connections = 500
shared_preload_libraries = 'pg_stat_statements'
# Logging
log_statement = 'none'
log_duration = off
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
# Autovacuum
autovacuum = on
autovacuum_max_workers = 6
autovacuum_naptime = 30s
# Zabbix tuning (planner cost and I/O concurrency)
random_page_cost = 1.1
effective_io_concurrency = 200
監視データのパーティショニング
MySQL パーティション設定
sql
-- Monthly RANGE partitioning for the Zabbix history tables (MySQL).
--
-- Layout shared by the procedure and the event below:
--   pYYYYMM  holds rows with clock before the first day of the following month
--   pfuture  catch-all (MAXVALUE) so inserts never fail for lack of a partition
--
-- NOTE(review): partitioning requires every unique key of a table to include
-- the partitioning column (clock) — confirm for each history table.
DELIMITER //

-- Partition every history% table in schema `zabbix`, covering two months
-- back through twelve months ahead of the current month.
CREATE PROCEDURE create_zabbix_partitions()
BEGIN
    DECLARE done INT DEFAULT FALSE;
    -- Must not be called table_name: inside a stored routine a local variable
    -- takes precedence over a column of the same name, so the cursor's
    -- SELECT would return the (NULL) variable instead of the column.
    DECLARE v_table_name VARCHAR(64);
    DECLARE v_month DATE;
    DECLARE v_last_month DATE;
    DECLARE v_partitions TEXT DEFAULT '';
    DECLARE cur CURSOR FOR
        SELECT t.TABLE_NAME
        FROM INFORMATION_SCHEMA.TABLES AS t
        WHERE t.TABLE_SCHEMA = 'zabbix'
          AND t.TABLE_NAME LIKE 'history%';
    DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

    -- Build the partition list once. Boundaries are emitted as integer
    -- literals because clock is an integer epoch column.
    SET v_month = DATE_SUB(DATE_FORMAT(CURDATE(), '%Y-%m-01'), INTERVAL 2 MONTH);
    SET v_last_month = DATE_ADD(DATE_FORMAT(CURDATE(), '%Y-%m-01'), INTERVAL 12 MONTH);
    WHILE v_month <= v_last_month DO
        SET v_partitions = CONCAT(
            v_partitions,
            'PARTITION p', DATE_FORMAT(v_month, '%Y%m'),
            ' VALUES LESS THAN (',
            CAST(UNIX_TIMESTAMP(DATE_ADD(v_month, INTERVAL 1 MONTH)) AS UNSIGNED),
            '), ');
        SET v_month = DATE_ADD(v_month, INTERVAL 1 MONTH);
    END WHILE;
    SET v_partitions = CONCAT(v_partitions,
                              'PARTITION pfuture VALUES LESS THAN MAXVALUE');

    OPEN cur;
    read_loop: LOOP
        FETCH cur INTO v_table_name;
        IF done THEN
            LEAVE read_loop;
        END IF;
        -- Partition directly on clock: wrapping it in FROM_UNIXTIME() /
        -- UNIX_TIMESTAMP() is not permitted in a partitioning expression.
        SET @sql = CONCAT('ALTER TABLE `zabbix`.`', v_table_name,
                          '` PARTITION BY RANGE (clock) (', v_partitions, ')');
        PREPARE stmt FROM @sql;
        EXECUTE stmt;
        DEALLOCATE PREPARE stmt;
    END LOOP;
    CLOSE cur;
END //

-- Automated partition maintenance. Runs monthly and, for every history%
-- table: drops the partition of the month three months back (if present)
-- and creates the partition twelve months ahead (if missing).
-- Requires the event scheduler to be enabled.
CREATE EVENT zabbix_partition_maintenance
ON SCHEDULE EVERY 1 MONTH
STARTS '2024-01-01 02:00:00'
DO
BEGIN
    DECLARE done INT DEFAULT FALSE;
    DECLARE v_table_name VARCHAR(64);
    DECLARE v_drop_partition VARCHAR(16);
    DECLARE v_new_partition VARCHAR(16);
    DECLARE v_new_month DATE;
    DECLARE v_new_limit BIGINT UNSIGNED;
    DECLARE cur CURSOR FOR
        SELECT t.TABLE_NAME
        FROM INFORMATION_SCHEMA.TABLES AS t
        WHERE t.TABLE_SCHEMA = 'zabbix'
          AND t.TABLE_NAME LIKE 'history%';
    DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;

    SET v_drop_partition = CONCAT('p', DATE_FORMAT(DATE_SUB(CURDATE(), INTERVAL 3 MONTH), '%Y%m'));
    SET v_new_month = DATE_ADD(DATE_FORMAT(CURDATE(), '%Y-%m-01'), INTERVAL 12 MONTH);
    SET v_new_partition = CONCAT('p', DATE_FORMAT(v_new_month, '%Y%m'));
    SET v_new_limit = CAST(UNIX_TIMESTAMP(DATE_ADD(v_new_month, INTERVAL 1 MONTH)) AS UNSIGNED);

    OPEN cur;
    table_loop: LOOP
        FETCH cur INTO v_table_name;
        IF done THEN
            LEAVE table_loop;
        END IF;

        -- Drop the expired partition only when it exists, so a missing
        -- partition does not abort the rest of the maintenance run.
        IF EXISTS (SELECT 1
                   FROM INFORMATION_SCHEMA.PARTITIONS AS p
                   WHERE p.TABLE_SCHEMA = 'zabbix'
                     AND p.TABLE_NAME = v_table_name
                     AND p.PARTITION_NAME = v_drop_partition) THEN
            SET @sql = CONCAT('ALTER TABLE `zabbix`.`', v_table_name,
                              '` DROP PARTITION ', v_drop_partition);
            PREPARE stmt FROM @sql;
            EXECUTE stmt;
            DEALLOCATE PREPARE stmt;
        END IF;

        -- ADD PARTITION cannot place a partition after a MAXVALUE one, so
        -- the new month is split off from pfuture instead.
        IF EXISTS (SELECT 1
                   FROM INFORMATION_SCHEMA.PARTITIONS AS p
                   WHERE p.TABLE_SCHEMA = 'zabbix'
                     AND p.TABLE_NAME = v_table_name
                     AND p.PARTITION_NAME = 'pfuture')
           AND NOT EXISTS (SELECT 1
                           FROM INFORMATION_SCHEMA.PARTITIONS AS p
                           WHERE p.TABLE_SCHEMA = 'zabbix'
                             AND p.TABLE_NAME = v_table_name
                             AND p.PARTITION_NAME = v_new_partition) THEN
            SET @sql = CONCAT('ALTER TABLE `zabbix`.`', v_table_name,
                              '` REORGANIZE PARTITION pfuture INTO (',
                              'PARTITION ', v_new_partition,
                              ' VALUES LESS THAN (', v_new_limit, '), ',
                              'PARTITION pfuture VALUES LESS THAN MAXVALUE)');
            PREPARE stmt FROM @sql;
            EXECUTE stmt;
            DEALLOCATE PREPARE stmt;
        END IF;
    END LOOP;
    CLOSE cur;
END //
DELIMITER ;
容量計画とスケーリング
容量計画の計算式
データサイズ計算
python
#!/usr/bin/env python3
# Zabbix Capacity Planning Calculator
class ZabbixCapacityPlanner:
    """Rough capacity estimates (storage, CPU, memory, network, disk I/O)
    for a Zabbix installation, based on per-record size assumptions."""

    def __init__(self):
        # Assumed record sizes in bytes
        self.history_record_size = 90   # history record
        self.trend_record_size = 128    # trend record
        self.event_record_size = 512    # event record

    def calculate_storage_requirements(self, hosts, items_per_host,
                                       history_retention_days=30,
                                       trend_retention_days=365,
                                       update_interval_seconds=60):
        """Estimate database storage.

        Args:
            hosts: number of monitored hosts.
            items_per_host: average number of items per host.
            history_retention_days: days of history (and events) kept.
            trend_retention_days: days of trends kept.
            update_interval_seconds: average item update interval in whole
                seconds (default 60, i.e. 1440 values per item per day).

        Returns:
            dict with item/data-point counts and sizes in GB (GiB-based).

        Raises:
            ValueError: if update_interval_seconds is not positive.
        """
        if update_interval_seconds <= 0:
            raise ValueError("update_interval_seconds must be positive")

        total_items = hosts * items_per_host
        # Values collected per day across all items
        data_points_per_day = total_items * 86400 // update_interval_seconds

        # History
        history_daily_size = data_points_per_day * self.history_record_size
        history_total_size = history_daily_size * history_retention_days

        # Trends (one record per item per hour)
        trend_daily_size = (total_items * 24) * self.trend_record_size
        trend_total_size = trend_daily_size * trend_retention_days

        # Events (assumption: 5 events per host per day)
        events_daily_size = hosts * 5 * self.event_record_size
        events_total_size = events_daily_size * history_retention_days

        # Configuration data (templates, hosts, items, ...) — rough estimate
        config_size = (hosts * 50 * 1024) + (total_items * 2 * 1024)

        total_size = (history_total_size + trend_total_size +
                      events_total_size + config_size)

        return {
            'hosts': hosts,
            'total_items': total_items,
            'data_points_per_day': data_points_per_day,
            'history_size_gb': history_total_size / (1024**3),
            'trend_size_gb': trend_total_size / (1024**3),
            'events_size_gb': events_total_size / (1024**3),
            'config_size_gb': config_size / (1024**3),
            'total_size_gb': total_size / (1024**3),
            'daily_growth_gb': (history_daily_size + trend_daily_size +
                                events_daily_size) / (1024**3)
        }

    def calculate_server_requirements(self, hosts, items_per_host,
                                      update_interval_seconds=60):
        """Estimate server resources.

        Args:
            hosts: number of monitored hosts.
            items_per_host: average number of items per host.
            update_interval_seconds: average item update interval
                (default 60).

        Returns:
            dict with 'nvps' (new values per second), raw and recommended
            CPU/memory figures, network Mbps and disk IOPS.

        Raises:
            ValueError: if update_interval_seconds is not positive.
        """
        if update_interval_seconds <= 0:
            raise ValueError("update_interval_seconds must be positive")

        total_items = hosts * items_per_host
        nvps = total_items / update_interval_seconds  # New Values Per Second

        # CPU (rule of thumb): 2 cores per 1000 NVPS, at least 4
        cpu_cores = max(4, int(nvps / 1000) * 2)

        # Memory: 4 GB base plus 150 bytes of cache per item
        base_memory = 4
        cache_memory = (total_items * 150) / (1024**3)
        total_memory = base_memory + cache_memory

        # Network: 50 bytes per value
        network_mbps = (nvps * 50 * 8) / (1024**2)

        # Disk I/O: one write plus one index update per value
        disk_iops = nvps * 2

        return {
            'nvps': nvps,
            'cpu_cores': cpu_cores,
            'memory_gb': total_memory,
            'network_mbps': network_mbps,
            'disk_iops': disk_iops,
            'recommended_memory_gb': total_memory * 1.5,  # headroom
            'recommended_cpu_cores': cpu_cores * 1.2
        }

    def generate_scaling_plan(self, current_hosts, target_hosts,
                              growth_period_months=12, items_per_host=50):
        """Project requirements month by month, assuming linear host growth.

        Args:
            current_hosts: hosts monitored today.
            target_hosts: hosts expected at the end of the period.
            growth_period_months: length of the period in months (>= 1).
            items_per_host: average items per host (default 50).

        Returns:
            list with one dict per month ('month', 'hosts', 'storage_gb',
            'memory_gb', 'cpu_cores', 'nvps').

        Raises:
            ValueError: if growth_period_months is less than 1.
        """
        if growth_period_months < 1:
            raise ValueError("growth_period_months must be at least 1")

        host_delta = target_hosts - current_hosts
        scaling_plan = []
        for month in range(1, growth_period_months + 1):
            # Multiply before dividing so the final month lands exactly on
            # target_hosts instead of a float just below it that int() would
            # truncate.
            projected_hosts = int(
                current_hosts + host_delta * month / growth_period_months)

            storage_req = self.calculate_storage_requirements(
                projected_hosts, items_per_host)
            server_req = self.calculate_server_requirements(
                projected_hosts, items_per_host)

            scaling_plan.append({
                'month': month,
                'hosts': projected_hosts,
                'storage_gb': storage_req['total_size_gb'],
                'memory_gb': server_req['recommended_memory_gb'],
                'cpu_cores': server_req['recommended_cpu_cores'],
                'nvps': server_req['nvps']
            })
        return scaling_plan
# Usage example
planner = ZabbixCapacityPlanner()

# Storage requirements for the current environment
current_req = planner.calculate_storage_requirements(hosts=1000, items_per_host=50)
print("Current Storage Requirements:")
print(f"Total Size: {current_req['total_size_gb']:.2f} GB")
print(f"Daily Growth: {current_req['daily_growth_gb']:.2f} GB")

# Server requirements for the current environment
server_req = planner.calculate_server_requirements(hosts=1000, items_per_host=50)
print("\nServer Requirements:")
print(f"CPU Cores: {server_req['recommended_cpu_cores']:.0f}")
print(f"Memory: {server_req['recommended_memory_gb']:.1f} GB")
print(f"NVPS: {server_req['nvps']:.0f}")

# Scaling plan: growth from 1000 to 5000 hosts over twelve months
scaling_plan = planner.generate_scaling_plan(
    current_hosts=1000, target_hosts=5000, growth_period_months=12)
print("\nScaling Plan (Next 12 months):")
# Show every third month
for index in range(0, len(scaling_plan), 3):
    plan = scaling_plan[index]
    print(f"Month {plan['month']}: {plan['hosts']} hosts, "
          f"{plan['storage_gb']:.0f}GB storage, "
          f"{plan['memory_gb']:.0f}GB RAM")
自動スケーリング(Kubernetes)
Zabbix Server Deployment
yaml
# zabbix-server-deployment.yaml
#
# Zabbix server HA nodes on Kubernetes: Deployment, Service and
# HorizontalPodAutoscaler.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: zabbix-server
  namespace: monitoring
spec:
  replicas: 2
  selector:
    matchLabels:
      app: zabbix-server
  template:
    metadata:
      labels:
        app: zabbix-server
    spec:
      containers:
        - name: zabbix-server
          image: zabbix/zabbix-server-mysql:6.4-alpine-latest
          env:
            - name: DB_SERVER_HOST
              value: "mysql-primary"
            - name: MYSQL_DATABASE
              value: "zabbix"
            - name: MYSQL_USER
              valueFrom:
                secretKeyRef:
                  name: mysql-secret
                  key: username
            - name: MYSQL_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: mysql-secret
                  key: password
            # HA node name = pod name.
            # NOTE(review): Deployment pod names change on every restart, so
            # each restart registers a new HA node — confirm this is acceptable
            # or use stable pod names.
            - name: ZBX_HANODENAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: ZBX_NODEADDRESS
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
          resources:
            requests:
              memory: "2Gi"
              cpu: "1000m"
            limits:
              memory: "8Gi"
              cpu: "4000m"
          ports:
            - containerPort: 10051
          # Liveness checks that the server process exists. A TCP probe on
          # 10051 would fail on standby HA nodes, which do not accept trapper
          # connections, and Kubernetes would restart them in a loop.
          livenessProbe:
            exec:
              command: ["pgrep", "-x", "zabbix_server"]
            initialDelaySeconds: 30
            periodSeconds: 10
          # Readiness stays on the trapper port: only the active node becomes
          # Ready, so the Service below routes to the active node only.
          readinessProbe:
            tcpSocket:
              port: 10051
            initialDelaySeconds: 5
            periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: zabbix-server
  namespace: monitoring
spec:
  selector:
    app: zabbix-server
  ports:
    - port: 10051
      targetPort: 10051
  type: ClusterIP
---
# NOTE(review): in an active/standby cluster extra replicas are additional
# standby nodes, not additional processing capacity — confirm that scaling
# on CPU/memory is the intended behavior.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: zabbix-server-hpa
  namespace: monitoring
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: zabbix-server
  minReplicas: 2
  maxReplicas: 5
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
監視とアラート
クラスタ監視スクリプト
python
#!/usr/bin/env python3
# Zabbix Cluster Health Monitor
import requests
import json
import subprocess
import time
from datetime import datetime
class ZabbixClusterMonitor:
    """Periodically checks the health of Zabbix server nodes.

    Each node is a dict with at least 'name' and 'host'; an optional 'port'
    overrides the Zabbix server port used by the port check.

    Attributes:
        nodes: list of node dicts.
        check_interval: seconds to wait between monitoring rounds.
        results: reserved for collected results (not populated here).
    """

    # Port probed when a node dict carries no 'port' key.
    DEFAULT_SERVER_PORT = 10051

    def __init__(self, nodes, check_interval=60):
        self.nodes = nodes
        self.check_interval = check_interval
        self.results = {}

    def check_node_health(self, node):
        """Run every check against one node.

        Returns:
            dict with 'node', 'timestamp', 'checks' (per-check results) and
            'status': 'healthy' if all checks are 'ok', 'critical' if any is
            'critical', 'warning' otherwise, or 'error' (plus an 'error'
            message) if running the checks raised.
        """
        health_status = {
            'node': node['name'],
            'timestamp': datetime.now().isoformat(),
            'status': 'unknown',
            'checks': {}
        }
        try:
            checks = health_status['checks']
            checks['process'] = self._check_process(node)
            checks['port'] = self._check_port(node)
            checks['api'] = self._check_api(node)
            checks['database'] = self._check_database(node)
            checks['resources'] = self._check_resources(node)

            # Overall status
            statuses = [result['status'] for result in checks.values()]
            if all(status == 'ok' for status in statuses):
                health_status['status'] = 'healthy'
            elif any(status == 'critical' for status in statuses):
                health_status['status'] = 'critical'
            else:
                health_status['status'] = 'warning'
        except Exception as e:
            health_status['status'] = 'error'
            health_status['error'] = str(e)
        return health_status

    def _check_process(self, node):
        """Check over SSH that a zabbix_server process exists on the node."""
        try:
            result = subprocess.run(
                ['ssh', node['host'], 'pgrep -f zabbix_server'],
                capture_output=True, text=True, timeout=10
            )
            running = result.returncode == 0
            return {
                'status': 'ok' if running else 'critical',
                'detail': 'Process running' if running else 'Process not found'
            }
        except Exception as e:
            return {'status': 'error', 'detail': str(e)}

    def _check_port(self, node):
        """Check that the Zabbix server port accepts TCP connections.

        Uses a direct socket connection, so no external `nc` binary is
        needed. Connection failures (refused, timed out, unresolvable host)
        are reported as 'critical'.
        """
        import socket
        try:
            port = int(node.get('port', self.DEFAULT_SERVER_PORT))
            with socket.create_connection((node['host'], port), timeout=5):
                pass
        except OSError:
            return {'status': 'critical', 'detail': 'Port not accessible'}
        except Exception as e:
            return {'status': 'error', 'detail': str(e)}
        return {'status': 'ok', 'detail': 'Port accessible'}

    def _check_api(self, node):
        """Call apiinfo.version on the node's frontend API."""
        try:
            response = requests.post(
                f"http://{node['host']}/zabbix/api_jsonrpc.php",
                json={
                    "jsonrpc": "2.0",
                    "method": "apiinfo.version",
                    "params": {},
                    "id": 1
                },
                timeout=10
            )
            if response.status_code == 200:
                payload = response.json()
                if 'result' in payload:
                    return {
                        'status': 'ok',
                        'detail': f"API responsive, version: {payload['result']}"
                    }
            return {
                'status': 'warning',
                'detail': f"API error: {response.status_code}"
            }
        except Exception as e:
            return {'status': 'critical', 'detail': str(e)}

    def _check_database(self, node):
        """Check over SSH that the node can query the zabbix database.

        NOTE(review): the password is embedded in the remote command line;
        consider a MySQL option file on the node instead.
        """
        try:
            result = subprocess.run([
                'ssh', node['host'],
                'mysql -u zabbix -ppassword zabbix -e "SELECT 1" 2>/dev/null'
            ], capture_output=True, timeout=10)
            connected = result.returncode == 0
            return {
                'status': 'ok' if connected else 'critical',
                'detail': 'DB connected' if connected else 'DB connection failed'
            }
        except Exception as e:
            return {'status': 'error', 'detail': str(e)}

    def _check_resources(self, node):
        """Read CPU and memory usage over SSH.

        'critical' above 90 %, 'warning' above 70 %, otherwise 'ok'.
        Unparseable command output is reported as 'error'.
        """
        try:
            # CPU usage
            cpu_result = subprocess.run([
                'ssh', node['host'],
                "top -bn1 | grep 'Cpu(s)' | awk '{print $2}' | cut -d'%' -f1"
            ], capture_output=True, text=True, timeout=10)
            # Memory usage
            mem_result = subprocess.run([
                'ssh', node['host'],
                "free | grep Mem | awk '{print ($3/$2) * 100.0}'"
            ], capture_output=True, text=True, timeout=10)

            cpu_usage = float(cpu_result.stdout.strip())
            mem_usage = float(mem_result.stdout.strip())

            peak = max(cpu_usage, mem_usage)
            if peak > 90:
                status = 'critical'
            elif peak > 70:
                status = 'warning'
            else:
                status = 'ok'

            return {
                'status': status,
                'detail': f"CPU: {cpu_usage:.1f}%, Memory: {mem_usage:.1f}%",
                'cpu_usage': cpu_usage,
                'memory_usage': mem_usage
            }
        except Exception as e:
            return {'status': 'error', 'detail': str(e)}

    def run_monitoring(self, max_iterations=None):
        """Run monitoring rounds, printing each result as JSON.

        Args:
            max_iterations: number of rounds to run; None (default) runs
                until the process is interrupted.
        """
        completed = 0
        while max_iterations is None or completed < max_iterations:
            cluster_status = {
                'timestamp': datetime.now().isoformat(),
                'nodes': [self.check_node_health(node) for node in self.nodes],
                'cluster_status': 'unknown'
            }

            # Cluster-wide status
            node_statuses = [entry['status'] for entry in cluster_status['nodes']]
            if all(status == 'healthy' for status in node_statuses):
                cluster_status['cluster_status'] = 'healthy'
            elif any(status == 'critical' for status in node_statuses):
                cluster_status['cluster_status'] = 'degraded'
            else:
                cluster_status['cluster_status'] = 'warning'

            print(json.dumps(cluster_status, indent=2))

            # A failing alert channel must not stop the monitoring loop.
            if cluster_status['cluster_status'] != 'healthy':
                try:
                    self._send_alert(cluster_status)
                except Exception as e:
                    print(f"Alert delivery failed: {e}")

            completed += 1
            if max_iterations is None or completed < max_iterations:
                time.sleep(self.check_interval)

    def _send_alert(self, cluster_status):
        """Send an alert (placeholder for Slack, e-mail, etc.)."""
        pass
# Example configuration. Guarded so that importing this module does not
# start the endless monitoring loop as a side effect.
if __name__ == "__main__":
    nodes = [
        {'name': 'zabbix-node1', 'host': '192.168.1.10'},
        {'name': 'zabbix-node2', 'host': '192.168.1.11'},
        {'name': 'zabbix-node3', 'host': '192.168.1.12'}
    ]
    # Start monitoring
    monitor = ZabbixClusterMonitor(nodes, check_interval=60)
    monitor.run_monitoring()
参考リンク
本セクションでは、Zabbixの高可用性とスケーリングについて詳しく説明しました。次のセクション「カスタマイゼーション」では、Zabbixの機能拡張について説明します。