8.3 トラブルシューティング
Zabbix環境で発生する一般的な問題の特定・分析・解決手法の体系的アプローチ
概要
Zabbixトラブルシューティングは、監視システムで発生する様々な問題を迅速かつ的確に解決するための重要なスキルです。体系的な問題分析アプローチと豊富な解決事例により、サービス品質の維持と運用効率の向上を実現できます。
トラブルシューティングの重要性
| 要素 | 影響範囲 | 対応効果 |
|---|---|---|
| 迅速な問題特定 | サービス停止時間短縮 | 可用性向上・損失削減 |
| 根本原因分析 | 再発防止・品質向上 | 長期安定性・信頼性 |
| 予防的対策 | 障害未然防止 | 運用コスト削減 |
| 知識蓄積 | 技術力向上・標準化 | 組織能力・効率性 |
| 自動化 | 人的ミス削減 | 運用品質・一貫性 |
問題分析の体系的アプローチ
ITIL準拠問題管理プロセス
問題分類フレームワーク
yaml
# 問題分類体系
問題カテゴリ:
レベル1 - サービス影響:
Critical: "全サービス停止"
High: "主要機能影響"
Medium: "部分機能影響"
Low: "軽微な問題"
レベル2 - 技術領域:
Infrastructure: "OS・ハードウェア"
Application: "Zabbix本体"
Database: "データベース関連"
Network: "ネットワーク・通信"
Security: "セキュリティ・認証"
Performance: "性能・レスポンス"
レベル3 - 具体的症状:
Connection: "接続問題"
Data: "データ関連"
Configuration: "設定問題"
Resource: "リソース不足"
Bug: "ソフトウェア欠陥"
Change: "変更起因"
# 問題解決プロセス
解決プロセス:
Step 1 - 初期対応:
- 症状確認・記録
- 影響範囲特定
- 緊急回避策実施
- ステークホルダー通知
Step 2 - 問題分析:
- ログ収集・分析
- 環境情報収集
- 再現テスト実施
- 根本原因特定
Step 3 - 解決実装:
- 解決策設計
- テスト環境検証
- 本番環境適用
- 効果確認
Step 4 - 事後対応:
- 再発防止策実装
- ドキュメント更新
- 知識ベース登録
- プロセス改善
診断ツールセット
統合診断スクリプト
bash
#!/bin/bash
# Zabbix integrated diagnostic script -- global settings.

# Script identity.
VERSION="2.1"
SCRIPT_NAME="zabbix-diagnostic"

# File locations: server configuration that is inspected, the generated
# report, and a per-run log file whose name carries a timestamp.
CONFIG_FILE="/etc/zabbix/zabbix_server.conf"
REPORT_FILE="/tmp/zabbix_diagnostic_report.txt"
LOG_FILE="/var/log/zabbix/diagnostic_$(date +%Y%m%d_%H%M%S).log"

# Color escape sequences. They are stored with a literal backslash, so the
# consumer has to interpret the escapes when printing.
NC='\033[0m' # No Color
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'
#######################################
# Record a message in the log file and print it once on the console.
# Globals:   LOG_FILE (appended to); RED, YELLOW, GREEN, BLUE, NC (read)
# Arguments: $1 - level: ERROR, WARNING, SUCCESS or INFO (anything else is
#                 printed without decoration)
#            $2 - message text
# Outputs:   one console line on stdout; one timestamped line in LOG_FILE
#######################################
log_message() {
  local level="$1"
  local message="$2"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  # Append to the log file only. Piping through `tee` also copied the line
  # to stdout, so every message showed up twice on the console.
  printf '[%s] [%s] %s\n' "$timestamp" "$level" "$message" >> "$LOG_FILE"

  # %b expands the backslash escapes stored in the color variables; the
  # message itself is printed verbatim via %s.
  case "$level" in
    "ERROR") printf '%b✗ %s%b\n' "$RED" "$message" "$NC" ;;
    "WARNING") printf '%b⚠ %s%b\n' "$YELLOW" "$message" "$NC" ;;
    "SUCCESS") printf '%b✓ %s%b\n' "$GREEN" "$message" "$NC" ;;
    "INFO") printf '%bℹ %s%b\n' "$BLUE" "$message" "$NC" ;;
    *) printf '%s\n' "$message" ;;
  esac
}
#######################################
# Start a fresh report with host, OS and resource information.
# Globals:   REPORT_FILE (overwritten)
# Outputs:   report header and system sections written to REPORT_FILE
#######################################
collect_system_info() {
  log_message "INFO" "Collecting system information..."

  # A single grouped redirect with `>` replaces any existing report.
  {
    printf '%s\n' "# Zabbix Diagnostic Report"
    printf '%s\n' "Generated: $(date)"
    printf '%s\n' "Hostname: $(hostname)"
    printf '%s\n' "OS: $(cat /etc/os-release | grep PRETTY_NAME | cut -d'"' -f2)"
    printf '%s\n' "Kernel: $(uname -r)"
    printf '%s\n' "Uptime: $(uptime)"
    printf '%s\n' "## System Resources"
    printf '%s\n' "### CPU Information"
    printf '%s\n' "$(lscpu | grep -E "(Architecture|CPU|Thread|Socket)")"
    printf '%s\n' "### Memory Information"
    printf '%s\n' "$(free -h)"
    printf '%s\n' "### Disk Usage"
    printf '%s\n' "$(df -h | grep -E '^/dev/')"
    printf '%s\n' "### Network Interfaces"
    printf '%s\n' "$(ip addr show | grep -E '^[0-9]|inet ')"
  } > "$REPORT_FILE"
}
#######################################
# Report whether the Zabbix server and agent processes are running.
# Globals:   REPORT_FILE (appended to)
# Outputs:   "Zabbix Process Status" section in REPORT_FILE
#######################################
check_zabbix_processes() {
  log_message "INFO" "Checking Zabbix processes..."
  echo "## Zabbix Process Status" >> "$REPORT_FILE"

  local server_pid server_procs server_threads service_status

  # Server. `pgrep -x` matches the process name exactly; `-f` matched any
  # command line containing the string (editors, tail, this script's args).
  if pgrep -x zabbix_server > /dev/null; then
    log_message "SUCCESS" "Zabbix server process is running"
    # The server forks many workers, so a plain pgrep returns a list of PIDs
    # that `ps -p` rejects. Use the oldest match as the main PID.
    server_pid=$(pgrep -o -x zabbix_server)
    server_procs=$(pgrep -c -x zabbix_server)
    server_threads=$(ps -o thcount= -p "$server_pid" | tr -d '[:space:]')
    {
      echo "### Zabbix Server"
      echo "- Status: Running"
      echo "- PID: $server_pid"
      echo "- Threads: $server_threads"
      echo "- Processes: $server_procs"
      echo "- Process Details:"
      # The bracket trick keeps grep from matching its own process entry.
      ps aux | grep '[z]abbix_server'
    } >> "$REPORT_FILE"
  else
    log_message "ERROR" "Zabbix server process is not running"
    echo "### Zabbix Server" >> "$REPORT_FILE"
    echo "- Status: NOT RUNNING" >> "$REPORT_FILE"
    # Add the systemd view when the unit is enabled.
    if systemctl is-enabled zabbix-server > /dev/null 2>&1; then
      service_status=$(systemctl is-active zabbix-server)
      echo "- Service Status: $service_status" >> "$REPORT_FILE"
    fi
  fi

  # Agent.
  if pgrep -x zabbix_agentd > /dev/null; then
    log_message "SUCCESS" "Zabbix agent process is running"
    {
      echo "### Zabbix Agent"
      echo "- Status: Running"
      ps aux | grep '[z]abbix_agentd'
    } >> "$REPORT_FILE"
  else
    log_message "WARNING" "Zabbix agent process is not running"
    echo "### Zabbix Agent" >> "$REPORT_FILE"
    echo "- Status: NOT RUNNING" >> "$REPORT_FILE"
  fi
}
#######################################
# Validate the server configuration and list key parameters.
# Globals:   CONFIG_FILE (read), REPORT_FILE (appended to)
# Outputs:   "Configuration Verification" section in REPORT_FILE
#######################################
verify_configuration() {
  log_message "INFO" "Verifying Zabbix configuration..."
  echo "## Configuration Verification" >> "$REPORT_FILE"

  local config_test
  local test_result=0
  local agent_config="/etc/zabbix/zabbix_agentd.conf"

  if [ -f "$CONFIG_FILE" ]; then
    log_message "INFO" "Found Zabbix server configuration file"
    echo "### Server Configuration Test" >> "$REPORT_FILE"

    if command -v zabbix_server > /dev/null 2>&1; then
      # Declaration and assignment are separate on purpose: with
      # `local v=$(cmd)` the following `$?` is the status of `local` (always
      # 0), so the test could never be reported as failed.
      # NOTE(review): `-T` (--test-config) is the validate-and-exit option in
      # recent Zabbix releases; older servers may not have it -- confirm for
      # the target version.
      config_test=$(zabbix_server -T -c "$CONFIG_FILE" 2>&1) || test_result=$?

      if [ "$test_result" -eq 0 ]; then
        log_message "SUCCESS" "Zabbix server configuration is valid"
        echo "- Result: PASSED" >> "$REPORT_FILE"
      else
        log_message "ERROR" "Zabbix server configuration has errors"
        {
          echo "- Result: FAILED"
          echo "- Errors:"
          echo "$config_test"
        } >> "$REPORT_FILE"
      fi
    else
      log_message "WARNING" "zabbix_server binary not found; configuration test skipped"
      echo "- Result: SKIPPED (zabbix_server not in PATH)" >> "$REPORT_FILE"
    fi

    # Key parameters; DBPassword is deliberately not listed.
    echo "### Key Configuration Parameters" >> "$REPORT_FILE"
    grep -E "^(DBHost|DBName|DBUser|LogFile|PidFile|ListenPort|StartPollers)" "$CONFIG_FILE" >> "$REPORT_FILE"
  else
    log_message "ERROR" "Zabbix server configuration file not found: $CONFIG_FILE"
    echo "### Server Configuration" >> "$REPORT_FILE"
    echo "- Status: Configuration file not found" >> "$REPORT_FILE"
  fi

  # Agent configuration summary.
  if [ -f "$agent_config" ]; then
    echo "### Agent Configuration" >> "$REPORT_FILE"
    grep -E "^(Server|ServerActive|Hostname|ListenPort)" "$agent_config" >> "$REPORT_FILE"
  fi
}
#######################################
# Test the MySQL/MariaDB connection configured for the Zabbix server and
# record basic database statistics.
# Globals:   CONFIG_FILE (read), REPORT_FILE (appended to)
# Outputs:   "Database Connectivity" section in REPORT_FILE
#######################################
check_database_connectivity() {
  log_message "INFO" "Checking database connectivity..."
  echo "## Database Connectivity" >> "$REPORT_FILE"

  # `-f2-` keeps values that contain '=' intact (e.g. passwords); `-m1`
  # guarantees a single line even if a key is repeated.
  local db_host db_name db_user db_password
  db_host=$(grep -m1 "^DBHost=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_name=$(grep -m1 "^DBName=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_user=$(grep -m1 "^DBUser=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_password=$(grep -m1 "^DBPassword=" "$CONFIG_FILE" | cut -d'=' -f2-)
  # NOTE(review): an absent DBHost is treated as localhost, which is assumed
  # to be the server's default -- confirm.
  db_host="${db_host:-localhost}"

  if [ -z "$db_name" ] || [ -z "$db_user" ]; then
    log_message "ERROR" "Database configuration parameters not found"
    echo "- Configuration: INCOMPLETE" >> "$REPORT_FILE"
    return
  fi

  # The password goes through the MYSQL_PWD environment variable instead of
  # `-p"$pw"`: an empty password would turn that into a bare `-p`, which
  # prompts interactively, and argv is visible to other users via `ps`.
  local -a mysql_cmd=(mysql -h "$db_host" -u "$db_user")

  if MYSQL_PWD="$db_password" "${mysql_cmd[@]}" -e "SELECT 1;" "$db_name" > /dev/null 2>&1; then
    log_message "SUCCESS" "Database connection successful"
    echo "- Connection: SUCCESS" >> "$REPORT_FILE"

    # Database size and table count.
    echo "### Database Statistics" >> "$REPORT_FILE"
    MYSQL_PWD="$db_password" "${mysql_cmd[@]}" -e "
SELECT
ROUND(SUM(data_length + index_length) / 1024 / 1024, 2) AS 'DB Size (MB)',
COUNT(*) AS 'Total Tables'
FROM information_schema.tables
WHERE table_schema='$db_name';" "$db_name" >> "$REPORT_FILE"

    # Connection counters and limit.
    echo "### Connection Information" >> "$REPORT_FILE"
    MYSQL_PWD="$db_password" "${mysql_cmd[@]}" -e "
SHOW STATUS LIKE 'Threads_connected';
SHOW STATUS LIKE 'Max_used_connections';
SHOW VARIABLES LIKE 'max_connections';" >> "$REPORT_FILE"
  else
    log_message "ERROR" "Database connection failed"
    echo "- Connection: FAILED" >> "$REPORT_FILE"
  fi
}
# Print the listening TCP sockets using ss, or netstat when ss is missing.
# Returns 127 when neither tool is installed.
_zbx_list_listening() {
  if command -v ss > /dev/null 2>&1; then
    ss -ltn
  elif command -v netstat > /dev/null 2>&1; then
    netstat -ltn
  else
    return 127
  fi
}

#######################################
# Check that the Zabbix server (10051) and agent (10050) ports are listening
# and that the local agent answers agent.ping.
# Globals:   REPORT_FILE (appended to)
# Outputs:   "Network Connectivity" section in REPORT_FILE
#######################################
check_network_connectivity() {
  log_message "INFO" "Checking network connectivity..."
  echo "## Network Connectivity" >> "$REPORT_FILE"
  echo "### Port Usage" >> "$REPORT_FILE"

  local sockets=""
  local have_tool=1
  local agent_test
  sockets=$(_zbx_list_listening 2> /dev/null) || have_tool=0

  if [ "$have_tool" -eq 0 ]; then
    # Without a socket tool the state is unknown; do not report the ports
    # as closed.
    log_message "WARNING" "Neither ss nor netstat is available; skipping port checks"
    echo "- Port checks: TOOL NOT AVAILABLE" >> "$REPORT_FILE"
  else
    # The port must be followed by whitespace so 10051 never matches 100510.
    if grep -Eq ':10051[[:space:]]' <<< "$sockets"; then
      log_message "SUCCESS" "Zabbix server port 10051 is listening"
      echo "- Server Port 10051: LISTENING" >> "$REPORT_FILE"
    else
      log_message "ERROR" "Zabbix server port 10051 is not listening"
      echo "- Server Port 10051: NOT LISTENING" >> "$REPORT_FILE"
    fi

    if grep -Eq ':10050[[:space:]]' <<< "$sockets"; then
      log_message "SUCCESS" "Zabbix agent port 10050 is listening"
      echo "- Agent Port 10050: LISTENING" >> "$REPORT_FILE"
    else
      log_message "WARNING" "Zabbix agent port 10050 is not listening"
      echo "- Agent Port 10050: NOT LISTENING" >> "$REPORT_FILE"
    fi

    echo "### Active Zabbix Ports" >> "$REPORT_FILE"
    grep -E ':(10050|10051)[[:space:]]' <<< "$sockets" >> "$REPORT_FILE"
  fi

  # Local agent round trip.
  echo "### Agent Connectivity Test" >> "$REPORT_FILE"
  if command -v zabbix_get > /dev/null; then
    agent_test=$(zabbix_get -s localhost -k agent.ping 2>&1)
    if [ "$agent_test" = "1" ]; then
      log_message "SUCCESS" "Local agent connectivity test passed"
      echo "- Local Agent Test: SUCCESS" >> "$REPORT_FILE"
    else
      log_message "WARNING" "Local agent connectivity test failed"
      echo "- Local Agent Test: FAILED" >> "$REPORT_FILE"
      echo "- Error: $agent_test" >> "$REPORT_FILE"
    fi
  else
    log_message "WARNING" "zabbix_get command not available"
    echo "- Agent Test: TOOL NOT AVAILABLE" >> "$REPORT_FILE"
  fi
}
#######################################
# Summarize recent errors and warnings from the Zabbix server log and the
# systemd journal.
# Globals:   REPORT_FILE (appended to)
#            ZABBIX_SERVER_LOG (optional override of the server log path)
# Outputs:   "Log File Analysis" section in REPORT_FILE
#######################################
analyze_log_files() {
  log_message "INFO" "Analyzing Zabbix log files..."
  echo "## Log File Analysis" >> "$REPORT_FILE"

  local server_log="${ZABBIX_SERVER_LOG:-/var/log/zabbix/zabbix_server.log}"
  local error_count warning_count

  if [ -f "$server_log" ]; then
    # Counts are captured directly. The earlier version wrote them to fixed
    # names under /tmp, which is unsafe for a script typically run as root.
    error_count=$(tail -1000 "$server_log" | grep -c -i error)
    warning_count=$(tail -1000 "$server_log" | grep -c -i warning)
    {
      echo "### Server Log Analysis"
      # Headings state what is actually selected: the last N matching lines
      # of the whole file, not a time window.
      echo "#### Recent Errors (Last 20 matches)"
      grep -i error "$server_log" | tail -20
      echo "#### Recent Warnings (Last 10 matches)"
      grep -i warning "$server_log" | tail -10
      echo "#### Database Related Issues"
      grep -i -E "(database|mysql|postgresql|connection)" "$server_log" | tail -10
      echo "#### Error Statistics (Last 1000 lines)"
      echo "- Errors: $error_count"
      echo "- Warnings: $warning_count"
    } >> "$REPORT_FILE"
  else
    log_message "WARNING" "Zabbix server log file not found: $server_log"
    echo "### Server Log" >> "$REPORT_FILE"
    echo "- Status: LOG FILE NOT FOUND" >> "$REPORT_FILE"
  fi

  # systemd journal for the server unit.
  echo "### System Log Analysis" >> "$REPORT_FILE"
  echo "#### Recent Zabbix Related System Messages" >> "$REPORT_FILE"
  if command -v journalctl > /dev/null 2>&1; then
    journalctl -u zabbix-server --since "1 hour ago" --no-pager | tail -20 >> "$REPORT_FILE"
  else
    echo "- journalctl not available" >> "$REPORT_FILE"
  fi
}
#######################################
# Record CPU, memory, disk I/O and Zabbix process resource usage.
# Globals:   REPORT_FILE (appended to)
# Outputs:   "Performance Analysis" section in REPORT_FILE
#######################################
analyze_performance() {
  log_message "INFO" "Analyzing system performance..."
  echo "## Performance Analysis" >> "$REPORT_FILE"
  echo "### Resource Utilization" >> "$REPORT_FILE"

  local cpu_usage mem_usage server_pids

  # CPU: second field of top's "Cpu(s)" summary line. The layout of that
  # line varies between top versions, so an empty result is reported as
  # unknown instead of a bare "%".
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')
  if [ -n "$cpu_usage" ]; then
    echo "- CPU Usage: ${cpu_usage}%" >> "$REPORT_FILE"
  else
    echo "- CPU Usage: unknown" >> "$REPORT_FILE"
  fi

  # Memory: used / total from the "Mem:" row of free. awk avoids the
  # dependency on bc and guards against a zero total.
  mem_usage=$(free | awk '/^Mem:/ { if ($2 > 0) printf "%.2f", $3 * 100 / $2 }')
  if [ -n "$mem_usage" ]; then
    echo "- Memory Usage: ${mem_usage}%" >> "$REPORT_FILE"
  else
    echo "- Memory Usage: unknown" >> "$REPORT_FILE"
  fi

  # Disk I/O.
  echo "### Disk I/O Statistics" >> "$REPORT_FILE"
  if command -v iostat > /dev/null; then
    iostat -x 1 3 | tail -10 >> "$REPORT_FILE"
  else
    echo "- iostat not available" >> "$REPORT_FILE"
  fi

  # Zabbix processes. The server runs as many processes; `ps -p` needs them
  # as one comma-separated list, not the newline-separated default of pgrep.
  echo "### Zabbix Process Performance" >> "$REPORT_FILE"
  server_pids=$(pgrep -d, -x zabbix_server)
  if [ -n "$server_pids" ]; then
    echo "#### Server Process Resource Usage" >> "$REPORT_FILE"
    ps -p "$server_pids" -o pid,pcpu,pmem,vsz,rss,etime,cmd >> "$REPORT_FILE"
    # All Zabbix processes sorted by CPU share (third column of ps aux).
    echo "#### Zabbix Process Details" >> "$REPORT_FILE"
    ps aux | grep '[z]abbix' | sort -k3 -nr >> "$REPORT_FILE"
  fi
}
#######################################
# Re-run the key health checks, write a recommendation for each failure and
# finish with an overall status.
# Globals:   CONFIG_FILE (read), REPORT_FILE (appended to)
# Outputs:   "Issues Identified and Recommendations" section in REPORT_FILE
#######################################
identify_issues_and_recommendations() {
  log_message "INFO" "Identifying issues and generating recommendations..."
  echo "## Issues Identified and Recommendations" >> "$REPORT_FILE"

  local issues=0
  local db_host db_name db_user db_password
  local sockets="" have_socket_tool=1
  local cpu_usage mem_usage disk_usage

  # --- Server process (exact name match, see pgrep -x) ---
  if ! pgrep -x zabbix_server > /dev/null; then
    {
      echo "### 🚨 CRITICAL: Zabbix Server Not Running"
      echo "**Recommendation**: Start Zabbix server service immediately"
      echo '```bash'
      echo "systemctl start zabbix-server"
      echo "systemctl status zabbix-server"
      echo '```'
      echo ""
    } >> "$REPORT_FILE"
    issues=$((issues + 1))
  fi

  # --- Database connection ---
  # `-f2-` keeps '=' inside values; an absent DBHost is assumed to mean
  # localhost (NOTE(review): confirm against the server default).
  db_host=$(grep -m1 "^DBHost=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_name=$(grep -m1 "^DBName=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_user=$(grep -m1 "^DBUser=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_password=$(grep -m1 "^DBPassword=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_host="${db_host:-localhost}"
  if [ -n "$db_name" ] && [ -n "$db_user" ]; then
    # MYSQL_PWD instead of -p"$pw": no interactive prompt when the password
    # is empty and nothing secret in the process list.
    if ! MYSQL_PWD="$db_password" mysql -h "$db_host" -u "$db_user" -e "SELECT 1;" "$db_name" > /dev/null 2>&1; then
      {
        echo "### 🚨 CRITICAL: Database Connection Failed"
        echo "**Recommendation**: Check database connectivity and credentials"
        echo "- Verify database server is running"
        echo "- Check network connectivity to database"
        echo "- Verify database credentials in configuration"
        echo ""
      } >> "$REPORT_FILE"
      issues=$((issues + 1))
    fi
  fi

  # --- Server port: prefer ss, fall back to netstat, skip if neither ---
  if command -v ss > /dev/null 2>&1; then
    sockets=$(ss -ltn 2> /dev/null)
  elif command -v netstat > /dev/null 2>&1; then
    sockets=$(netstat -ltn 2> /dev/null)
  else
    have_socket_tool=0
  fi
  if [ "$have_socket_tool" -eq 1 ] && ! grep -Eq ':10051[[:space:]]' <<< "$sockets"; then
    {
      echo "### ⚠️ WARNING: Zabbix Server Port Not Listening"
      echo "**Recommendation**: Check Zabbix server configuration and firewall"
      echo '```bash'
      echo "# Check if port is configured correctly"
      echo "grep ListenPort /etc/zabbix/zabbix_server.conf"
      echo "# Check firewall rules"
      echo "firewall-cmd --list-ports"
      echo '```'
      echo ""
    } >> "$REPORT_FILE"
    issues=$((issues + 1))
  fi

  # --- CPU: numeric comparison in awk, so an empty or non-numeric reading
  # evaluates to 0 instead of raising a syntax error ---
  cpu_usage=$(top -bn1 2> /dev/null | grep "Cpu(s)" | awk '{print $2}' | sed 's/%us,//')
  if awk -v v="$cpu_usage" 'BEGIN { exit !(v + 0 > 80) }'; then
    {
      echo "### ⚠️ WARNING: High CPU Usage"
      echo "**Current CPU Usage**: ${cpu_usage}%"
      echo "**Recommendation**: Investigate high CPU usage"
      echo "- Check for runaway processes"
      echo "- Review Zabbix configuration for optimization"
      echo "- Consider scaling resources"
      echo ""
    } >> "$REPORT_FILE"
    issues=$((issues + 1))
  fi

  # --- Memory: used / total from free, guarded against a zero total ---
  mem_usage=$(free | awk '/^Mem:/ { if ($2 > 0) printf "%.2f", $3 * 100 / $2 }')
  if awk -v v="$mem_usage" 'BEGIN { exit !(v + 0 > 85) }'; then
    {
      echo "### ⚠️ WARNING: High Memory Usage"
      echo "**Current Memory Usage**: ${mem_usage}%"
      echo "**Recommendation**: Investigate memory usage"
      echo "- Check for memory leaks"
      echo "- Review Zabbix cache settings"
      echo "- Consider adding more RAM"
      echo ""
    } >> "$REPORT_FILE"
    issues=$((issues + 1))
  fi

  # --- Disk: -P keeps each filesystem on one line; only compare when the
  # value is a plain integer ---
  disk_usage=$(df -P / | awk 'NR==2 {print $5}' | sed 's/%//')
  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && [ "$disk_usage" -gt 90 ]; then
    {
      echo "### 🚨 CRITICAL: Low Disk Space"
      echo "**Current Disk Usage**: ${disk_usage}%"
      echo "**Recommendation**: Free up disk space immediately"
      echo "- Clean up old log files"
      echo "- Archive old monitoring data"
      echo "- Check for large temporary files"
      echo ""
    } >> "$REPORT_FILE"
    issues=$((issues + 1))
  fi

  # --- Overall status ---
  echo "### Summary" >> "$REPORT_FILE"
  if [ "$issues" -eq 0 ]; then
    echo "✅ **Overall Status**: HEALTHY" >> "$REPORT_FILE"
    echo "No critical issues detected. System appears to be running normally." >> "$REPORT_FILE"
    log_message "SUCCESS" "No critical issues detected"
  elif [ "$issues" -le 2 ]; then
    echo "⚠️ **Overall Status**: ATTENTION REQUIRED" >> "$REPORT_FILE"
    echo "Some issues detected that require attention. Please review recommendations above." >> "$REPORT_FILE"
    log_message "WARNING" "$issues issues detected requiring attention"
  else
    echo "🚨 **Overall Status**: CRITICAL" >> "$REPORT_FILE"
    echo "Multiple critical issues detected. Immediate action required." >> "$REPORT_FILE"
    log_message "ERROR" "$issues critical issues detected"
  fi
  echo "**Total Issues Found**: $issues" >> "$REPORT_FILE"
}
#######################################
# Append the closing footer to the report and announce where the report and
# the log were written.
# Globals:   REPORT_FILE (appended to), VERSION, LOG_FILE (read)
#######################################
finalize_report() {
  # Footer: blank line, horizontal rule, then generation metadata.
  {
    printf '\n'
    printf '%s\n' "---"
    printf '%s\n' "**Report Generated**: $(date)"
    printf '%s\n' "**Script Version**: $VERSION"
    printf '%s\n' "**Log File**: $LOG_FILE"
  } >> "$REPORT_FILE"

  log_message "INFO" "Diagnostic report saved to: $REPORT_FILE"
  log_message "INFO" "Detailed logs saved to: $LOG_FILE"
}
#######################################
# Entry point: run the checks for the selected mode and finalize the report.
# Globals:   VERSION (read), REPORT_FILE (written)
# Arguments: [mode]  - full (default), quick, config, network, performance
#                      or logs; an unknown mode falls back to full
#            [--show] - print the report afterwards; accepted in any position
#######################################
main() {
  local mode=""
  local show_report=0
  local arg

  # The first non-option argument is the mode; --show may appear anywhere
  # (it used to be recognized only as the second argument).
  for arg in "$@"; do
    if [ "$arg" = "--show" ]; then
      show_report=1
    elif [ -z "$mode" ]; then
      mode="$arg"
    fi
  done
  mode="${mode:-full}"

  echo "Zabbix Diagnostic Tool v$VERSION"
  echo "=================================="
  echo ""
  log_message "INFO" "Starting Zabbix diagnostic (mode: $mode)..."

  # Only the full run recreates the report (via collect_system_info). The
  # partial modes append, so start them from a fresh file; otherwise output
  # from earlier runs stays in the report.
  case "$mode" in
    quick | config | network | performance | logs)
      printf '# Zabbix Diagnostic Report (mode: %s)\nGenerated: %s\n' \
        "$mode" "$(date)" > "$REPORT_FILE"
      ;;
  esac

  case "$mode" in
    "quick")
      check_zabbix_processes
      check_database_connectivity
      identify_issues_and_recommendations
      ;;
    "config")
      verify_configuration
      ;;
    "network")
      check_network_connectivity
      ;;
    "performance")
      analyze_performance
      ;;
    "logs")
      analyze_log_files
      ;;
    *)
      if [ "$mode" != "full" ]; then
        log_message "WARNING" "Unknown mode '$mode'; running full diagnostic"
      fi
      collect_system_info
      check_zabbix_processes
      verify_configuration
      check_database_connectivity
      check_network_connectivity
      analyze_log_files
      analyze_performance
      identify_issues_and_recommendations
      ;;
  esac

  finalize_report
  echo ""
  echo "Diagnostic completed. Report available at: $REPORT_FILE"

  # Optional report display.
  if [ "$show_report" -eq 1 ]; then
    echo ""
    echo "=== DIAGNOSTIC REPORT ==="
    cat "$REPORT_FILE"
  fi
}
#######################################
# Print usage information: modes, options and examples.
# Globals:   VERSION (read)
# Outputs:   help text on stdout
#######################################
show_help() {
  # One printf call; each argument becomes one output line.
  printf '%s\n' \
    "Zabbix Diagnostic Tool v$VERSION" \
    "" \
    "USAGE:" \
    " $0 [mode] [options]" \
    "" \
    "MODES:" \
    " full Full diagnostic (default)" \
    " quick Quick health check" \
    " config Configuration verification only" \
    " network Network connectivity check" \
    " performance Performance analysis" \
    " logs Log file analysis" \
    "" \
    "OPTIONS:" \
    " --show Display report after generation" \
    " --help Show this help message" \
    "" \
    "EXAMPLES:" \
    " $0 # Full diagnostic" \
    " $0 quick --show # Quick check with report display" \
    " $0 performance # Performance analysis only"
}
# Script execution: answer a help request given as the first argument,
# otherwise hand every argument to main.
case "${1:-}" in
  --help | -h)
    show_help
    exit 0
    ;;
esac
main "$@"
一般的な問題と解決方法
サービス起動問題
Zabbixサーバー起動失敗
yaml
# 起動失敗の一般的原因と対処
起動失敗パターン:
設定ファイルエラー:
症状:
- "cannot parse config file"
- "invalid parameter"
- "configuration file error"
確認方法:
- zabbix_server -t
- journalctl -u zabbix-server
- /var/log/zabbix/zabbix_server.log
解決手順:
1. 設定ファイル構文確認
2. パラメータ値検証
3. ファイル権限確認
4. バックアップからの復旧
データベース接続エラー:
症状:
- "cannot connect to database"
- "access denied for user"
- "database connection timeout"
解決手順:
1. データベースサービス確認
2. 認証情報検証
3. ネットワーク接続確認
4. ファイアウォール設定確認
権限問題:
症状:
- "permission denied"
- "cannot write to log file"
- "cannot create PID file"
解決手順:
1. ファイル・ディレクトリ権限確認
2. SELinux設定確認
3. ユーザー・グループ設定確認
4. プロセス実行権限確認
リソース不足:
症状:
- "cannot allocate memory"
- "too many open files"
- "disk space full"
解決手順:
1. システムリソース確認
2. ulimit設定確認
3. ディスク容量確認
4. メモリ使用量最適化
自動解決スクリプト
bash
#!/bin/bash
# Automatic resolver for Zabbix server startup problems -- global settings.

# Log written by this script.
LOG_FILE="/var/log/zabbix/startup_fix.log"
# Server configuration file that is validated and repaired.
CONFIG_FILE="/etc/zabbix/zabbix_server.conf"
# systemd unit that is inspected and restarted.
SERVICE_NAME="zabbix-server"
#######################################
# Print a timestamped message and append the same line to the log file.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - message text
#######################################
log_fix() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}
#######################################
# Diagnose why the Zabbix server unit is not active, apply the available
# fixes and try to restart it.
# Globals:   SERVICE_NAME, CONFIG_FILE (read; CONFIG_FILE may be replaced
#            from "${CONFIG_FILE}.backup")
# Returns:   0 if the service is (or becomes) active, 1 otherwise
#######################################
fix_startup_issues() {
  log_fix "Starting Zabbix startup issue diagnosis..."

  # 1. Current service state.
  local service_status service_enabled line
  service_status=$(systemctl is-active "$SERVICE_NAME")
  service_enabled=$(systemctl is-enabled "$SERVICE_NAME")
  log_fix "Service status: $service_status"
  log_fix "Service enabled: $service_enabled"

  if [ "$service_status" = "active" ]; then
    log_fix "✓ Zabbix server is already running"
    return 0
  fi
  log_fix "Service is not active, attempting fixes..."

  # 2. Configuration validation and repair.
  # Only act on a real validation failure. With a flag the binary does not
  # understand, every run "fails" and the live configuration would be
  # overwritten from the backup unconditionally.
  # NOTE(review): assumes the help output advertises --test-config on
  # versions that support `-T` -- confirm for the target release.
  if zabbix_server -h 2>&1 | grep -q -- '--test-config'; then
    if ! zabbix_server -T -c "$CONFIG_FILE" > /dev/null 2>&1; then
      log_fix "Configuration file has errors, attempting to fix..."
      if [ -f "${CONFIG_FILE}.backup" ]; then
        # Keep the failing file so the restore can be undone.
        cp -p "$CONFIG_FILE" "${CONFIG_FILE}.broken.$(date +%Y%m%d_%H%M%S)"
        log_fix "Restoring from backup configuration..."
        cp "${CONFIG_FILE}.backup" "$CONFIG_FILE"
        if zabbix_server -T -c "$CONFIG_FILE" > /dev/null 2>&1; then
          log_fix "Configuration restored successfully from backup"
        else
          log_fix "Backup configuration also has errors"
        fi
      fi
      # Basic repairs of common mistakes.
      fix_common_config_errors
    fi
  else
    log_fix "zabbix_server does not support --test-config; skipping configuration validation"
  fi

  # 3. Permissions, 4. resources, 5. database connectivity.
  fix_permission_issues
  fix_resource_issues
  fix_database_connectivity

  # 6. Restart and verify.
  log_fix "Attempting to restart Zabbix server..."
  systemctl restart "$SERVICE_NAME"
  sleep 5
  if systemctl is-active "$SERVICE_NAME" > /dev/null; then
    log_fix "✓ Zabbix server started successfully"
    return 0
  fi

  log_fix "✗ Zabbix server failed to start"
  log_fix "Error details:"
  # IFS= and -r keep journal lines intact (leading blanks, backslashes).
  journalctl -u "$SERVICE_NAME" --since "1 minute ago" --no-pager | tail -10 |
    while IFS= read -r line; do
      log_fix " $line"
    done
  return 1
}
#######################################
# Repair common mistakes in the server configuration file: stray control
# characters, duplicated parameter lines and missing LogFile/PidFile.
# Globals:   CONFIG_FILE (rewritten in place; a timestamped copy is kept)
# Returns:   1 if the cleaned file could not be produced, otherwise the
#            status of the final log call
#######################################
fix_common_config_errors() {
  log_fix "Fixing common configuration errors..."

  local tmp_file
  tmp_file=$(mktemp) || {
    log_fix "Cannot create temporary file"
    return 1
  }

  # Safety copy before any automatic edit.
  cp -p "$CONFIG_FILE" "${CONFIG_FILE}.bak.$(date +%Y%m%d_%H%M%S)"

  # tr: drop control characters (including CR) but keep TAB and newline.
  # awk: drop exact duplicates of parameter lines only; blank and comment
  # lines are passed through so the file keeps its layout.
  if tr -d '\000-\010\013-\037\177' < "$CONFIG_FILE" |
    awk '/^[[:space:]]*(#|$)/ { print; next } !seen[$0]++' > "$tmp_file"; then
    # Write back through the existing file rather than `mv`, which would
    # replace it with a new file carrying default owner and mode -- the file
    # holds DBPassword.
    cat "$tmp_file" > "$CONFIG_FILE"
  else
    log_fix "Could not clean configuration file; leaving it unchanged"
    rm -f -- "$tmp_file"
    return 1
  fi
  rm -f -- "$tmp_file"

  # Make sure the basic parameters exist.
  if ! grep -q "^LogFile=" "$CONFIG_FILE"; then
    echo "LogFile=/var/log/zabbix/zabbix_server.log" >> "$CONFIG_FILE"
    log_fix "Added missing LogFile parameter"
  fi
  if ! grep -q "^PidFile=" "$CONFIG_FILE"; then
    echo "PidFile=/run/zabbix/zabbix_server.pid" >> "$CONFIG_FILE"
    log_fix "Added missing PidFile parameter"
  fi

  # Re-validate.
  # NOTE(review): `-T` (--test-config) is assumed to be the validate option
  # of the installed zabbix_server -- confirm for the target version.
  if zabbix_server -T -c "$CONFIG_FILE" > /dev/null 2>&1; then
    log_fix "Configuration errors fixed"
  else
    log_fix "Configuration still has errors"
  fi
}
#######################################
# Create the Zabbix log and runtime directories, reset ownership and modes,
# and restore SELinux contexts when SELinux is enforcing.
# Globals:   CONFIG_FILE (ownership and mode are changed)
#######################################
fix_permission_issues() {
  log_fix "Fixing permission issues..."

  local dir

  # Directories: create, hand over to zabbix, then set the mode.
  for dir in /var/log/zabbix /run/zabbix; do
    mkdir -p "$dir"
  done
  for dir in /var/log/zabbix /run/zabbix; do
    chown zabbix:zabbix "$dir"
  done
  for dir in /var/log/zabbix /run/zabbix; do
    chmod 755 "$dir"
  done

  # Configuration file: owned by root, group zabbix, mode 640.
  chown root:zabbix "$CONFIG_FILE"
  chmod 640 "$CONFIG_FILE"

  # SELinux handling only applies when getenforce exists and reports
  # Enforcing.
  if command -v getenforce >/dev/null && [ "$(getenforce)" = "Enforcing" ]; then
    log_fix "Checking SELinux settings..."
    # Restore contexts on the directories and the configuration file.
    for dir in /var/log/zabbix /run/zabbix; do
      restorecon -R "$dir"
    done
    restorecon "$CONFIG_FILE"
    # Set the booleans; a failure here is ignored.
    setsebool -P zabbix_can_network on 2>/dev/null || true
    setsebool -P httpd_can_network_connect on 2>/dev/null || true
  fi

  log_fix "Permission issues fixed"
}
#######################################
# Check disk space, available memory and the open-file limit, applying basic
# remedies where a threshold is exceeded.
# Outputs:   progress messages via log_fix; may rotate and delete logs, drop
#            the page cache and append to /etc/security/limits.conf
#######################################
fix_resource_issues() {
  log_fix "Checking and fixing resource issues..."

  local disk_usage mem_available current_limit
  local limits_file="/etc/security/limits.conf"

  # Disk: -P keeps each filesystem on one line; compare only real integers.
  disk_usage=$(df -P /var | awk 'NR==2 {print $5}' | sed 's/%//')
  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && [ "$disk_usage" -gt 95 ]; then
    log_fix "Critical disk space issue detected (${disk_usage}%)"
    # Force a log rotation.
    logrotate -f /etc/logrotate.conf
    # Delete old rotated logs. Limited to regular files in the Zabbix log
    # directory: removing every rotated log under /var/log would also
    # destroy unrelated system and audit logs.
    find /var/log/zabbix -type f -name "*.log.*" -mtime +7 -delete
    log_fix "Disk space cleanup completed"
  fi

  # Memory: "available" is a column of free's "Mem:" row, not a row of its
  # own, so grepping for "Available" never matched anything.
  # NOTE(review): assumes the 7-column layout of current procps `free`.
  mem_available=$(free | awk '/^Mem:/ {print $7}')
  if [[ "$mem_available" =~ ^[0-9]+$ ]] && [ "$mem_available" -lt 1048576 ]; then # below 1 GB (KiB)
    log_fix "Low memory detected, clearing cache..."
    if [ -w /proc/sys/vm/drop_caches ]; then
      sync && echo 3 > /proc/sys/vm/drop_caches
    fi
  fi

  # File descriptors: `ulimit -n` may print "unlimited"; append the limits
  # only once so repeated runs do not pile up duplicate lines.
  # NOTE(review): limits.conf is applied through PAM; a systemd-managed
  # service may need LimitNOFILE in its unit instead -- confirm.
  current_limit=$(ulimit -n)
  if [[ "$current_limit" =~ ^[0-9]+$ ]] && [ "$current_limit" -lt 4096 ]; then
    if ! grep -Eq '^zabbix[[:space:]]+(soft|hard)[[:space:]]+nofile' "$limits_file" 2> /dev/null; then
      log_fix "Increasing file descriptor limit..."
      echo "zabbix soft nofile 65536" >> "$limits_file"
      echo "zabbix hard nofile 65536" >> "$limits_file"
    fi
  fi
}
#######################################
# Test the configured MySQL/MariaDB connection and try to restore it.
# Globals:   CONFIG_FILE (read)
# Outputs:   progress messages via log_fix; may start a local database
#            service
#######################################
fix_database_connectivity() {
  log_fix "Checking database connectivity..."

  # `-f2-` keeps '=' inside values (passwords); `-m1` yields a single line.
  local db_host db_name db_user db_password
  db_host=$(grep -m1 "^DBHost=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_name=$(grep -m1 "^DBName=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_user=$(grep -m1 "^DBUser=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_password=$(grep -m1 "^DBPassword=" "$CONFIG_FILE" | cut -d'=' -f2-)
  # NOTE(review): an absent DBHost is assumed to mean localhost -- confirm.
  db_host="${db_host:-localhost}"

  if [ -z "$db_name" ] || [ -z "$db_user" ]; then
    log_fix "Database configuration parameters missing"
    return
  fi

  local is_local=0
  case "$db_host" in
    localhost | 127.0.0.1 | ::1) is_local=1 ;;
  esac

  # The password is passed via MYSQL_PWD: `-p""` with an empty password
  # becomes a bare `-p` that waits for interactive input, and argv is
  # visible in the process list.
  local -a mysql_cmd=(mysql -h "$db_host" -u "$db_user" -e "SELECT 1;" "$db_name")

  if MYSQL_PWD="$db_password" "${mysql_cmd[@]}" > /dev/null 2>&1; then
    log_fix "✓ Database connection is working"
    return
  fi

  log_fix "Database connection failed, attempting fixes..."

  # Starting a local service only makes sense when the database is local.
  if [ "$is_local" -eq 1 ]; then
    if systemctl is-active mysql > /dev/null || systemctl is-active mariadb > /dev/null; then
      log_fix "Database service is running"
    else
      log_fix "Starting database service..."
      systemctl start mysql 2> /dev/null || systemctl start mariadb 2> /dev/null
      sleep 3
    fi
  else
    log_fix "Database host $db_host is remote; not starting a local database service"
  fi

  # Retry.
  if MYSQL_PWD="$db_password" "${mysql_cmd[@]}" > /dev/null 2>&1; then
    log_fix "✓ Database connection restored"
  else
    log_fix "✗ Database connection still failing"
    # For a remote host, check basic reachability.
    if [ "$is_local" -eq 0 ] && ! ping -c 1 "$db_host" > /dev/null 2>&1; then
      log_fix "Network connectivity to database host failed"
    fi
  fi
}
#######################################
# Entry point: run the startup repair and exit with its result.
# Globals:   LOG_FILE (read, shown on failure)
# Returns:   does not return; exits 0 on success and 1 on failure
#######################################
main() {
  printf '%s\n' \
    "Zabbix Startup Issue Resolver" \
    "============================="

  # Failure path first; falling through means success.
  if ! fix_startup_issues; then
    echo "✗ Failed to resolve all startup issues"
    echo "Check log file: $LOG_FILE"
    exit 1
  fi

  echo "✓ Zabbix startup issues resolved successfully"
  exit 0
}
# Script execution: pass all command-line arguments to main.
main "$@"
パフォーマンス問題
パフォーマンス問題の特定
性能監視・分析システム
python
#!/usr/bin/env python3
"""Zabbix性能分析・最適化システム"""
import psutil
import mysql.connector
import time
import json
import subprocess
from datetime import datetime, timedelta
from typing import Dict, List, Tuple
class ZabbixPerformanceAnalyzer:
def __init__(self, db_config: Dict):
self.db_config = db_config
self.connection = None
self.performance_data = {}
def connect_database(self):
"""データベース接続"""
try:
self.connection = mysql.connector.connect(**self.db_config)
return True
except Exception as e:
print(f"Database connection failed: {e}")
return False
def analyze_system_performance(self) -> Dict:
"""システム性能分析"""
print("Analyzing system performance...")
# CPU使用率
cpu_percent = psutil.cpu_percent(interval=1)
cpu_count = psutil.cpu_count()
# メモリ使用状況
memory = psutil.virtual_memory()
# ディスクI/O
disk_io = psutil.disk_io_counters()
# ネットワークI/O
network_io = psutil.net_io_counters()
# プロセス情報
zabbix_processes = []
for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
if 'zabbix' in proc.info['name']:
zabbix_processes.append(proc.info)
return {
'timestamp': datetime.now().isoformat(),
'cpu': {
'usage_percent': cpu_percent,
'core_count': cpu_count,
'load_average': psutil.getloadavg()
},
'memory': {
'total': memory.total,
'available': memory.available,
'used': memory.used,
'usage_percent': memory.percent
},
'disk_io': {
'read_bytes': disk_io.read_bytes,
'write_bytes': disk_io.write_bytes,
'read_count': disk_io.read_count,
'write_count': disk_io.write_count
},
'network_io': {
'bytes_sent': network_io.bytes_sent,
'bytes_recv': network_io.bytes_recv,
'packets_sent': network_io.packets_sent,
'packets_recv': network_io.packets_recv
},
'zabbix_processes': zabbix_processes
}
def analyze_database_performance(self) -> Dict:
"""データベース性能分析"""
print("Analyzing database performance...")
if not self.connection:
return {}
cursor = self.connection.cursor(dictionary=True)
# データベース統計情報
queries = {
'status_variables': "SHOW STATUS",
'process_list': "SHOW PROCESSLIST",
'table_sizes': """
SELECT
table_name,
ROUND(((data_length + index_length) / 1024 / 1024), 2) AS 'size_mb',
table_rows
FROM information_schema.TABLES
WHERE table_schema = %s
ORDER BY (data_length + index_length) DESC
LIMIT 10
""",
'slow_queries': """
SELECT
DIGEST_TEXT,
COUNT_STAR,
AVG_TIMER_WAIT/1000000000 as avg_time_sec,
MAX_TIMER_WAIT/1000000000 as max_time_sec
FROM performance_schema.events_statements_summary_by_digest
WHERE AVG_TIMER_WAIT > 1000000000
ORDER BY AVG_TIMER_WAIT DESC
LIMIT 10
"""
}
results = {}
try:
# ステータス変数
cursor.execute(queries['status_variables'])
status_vars = {row['Variable_name']: row['Value'] for row in cursor.fetchall()}
results['status_variables'] = status_vars
# プロセスリスト
cursor.execute(queries['process_list'])
process_list = cursor.fetchall()
results['active_connections'] = len(process_list)
results['process_list'] = process_list
# テーブルサイズ
cursor.execute(queries['table_sizes'], (self.db_config['database'],))
results['largest_tables'] = cursor.fetchall()
# スロークエリ(performance_schemaが有効な場合)
try:
cursor.execute(queries['slow_queries'])
results['slow_queries'] = cursor.fetchall()
except:
results['slow_queries'] = []
except Exception as e:
print(f"Database analysis error: {e}")
return results
def analyze_zabbix_internals(self) -> Dict:
"""Zabbix内部メトリクス分析"""
print("Analyzing Zabbix internal metrics...")
if not self.connection:
return {}
cursor = self.connection.cursor(dictionary=True)
internal_queries = {
'host_count': "SELECT COUNT(*) as count FROM hosts WHERE status = 0",
'item_count': "SELECT COUNT(*) as count FROM items WHERE status = 0",
'trigger_count': "SELECT COUNT(*) as count FROM triggers WHERE status = 0",
'active_problems': "SELECT COUNT(*) as count FROM problem WHERE source = 0 AND object = 0",
'queue_stats': """
SELECT
type,
COUNT(*) as item_count,
AVG(delay) as avg_delay
FROM items i
JOIN hosts h ON i.hostid = h.hostid
WHERE i.status = 0 AND h.status = 0
GROUP BY type
""",
'data_freshness': """
SELECT
COUNT(CASE WHEN lastvalue_ts > UNIX_TIMESTAMP() - 3600 THEN 1 END) as updated_last_hour,
COUNT(CASE WHEN lastvalue_ts > UNIX_TIMESTAMP() - 86400 THEN 1 END) as updated_last_day,
COUNT(*) as total_items
FROM items i
JOIN hosts h ON i.hostid = h.hostid
WHERE i.status = 0 AND h.status = 0
""",
'history_size': """
SELECT
'history' as table_name,
COUNT(*) as record_count,
MIN(FROM_UNIXTIME(clock)) as oldest_record,
MAX(FROM_UNIXTIME(clock)) as newest_record
FROM history
UNION ALL
SELECT
'history_uint' as table_name,
COUNT(*) as record_count,
MIN(FROM_UNIXTIME(clock)) as oldest_record,
MAX(FROM_UNIXTIME(clock)) as newest_record
FROM history_uint
UNION ALL
SELECT
'history_str' as table_name,
COUNT(*) as record_count,
MIN(FROM_UNIXTIME(clock)) as oldest_record,
MAX(FROM_UNIXTIME(clock)) as newest_record
FROM history_str
"""
}
results = {}
try:
for query_name, query in internal_queries.items():
cursor.execute(query)
if query_name in ['host_count', 'item_count', 'trigger_count', 'active_problems']:
results[query_name] = cursor.fetchone()['count']
elif query_name == 'data_freshness':
results[query_name] = cursor.fetchone()
else:
results[query_name] = cursor.fetchall()
except Exception as e:
print(f"Zabbix internals analysis error: {e}")
return results
def identify_performance_bottlenecks(self, system_data: Dict, db_data: Dict, zabbix_data: Dict) -> List[Dict]:
    """Derive a list of bottlenecks from previously collected data.

    Args:
        system_data: Host figures; ``cpu.usage_percent`` and
            ``memory.usage_percent`` are read.
        db_data: Database figures; ``active_connections``,
            ``status_variables.max_connections``, ``slow_queries`` and
            ``largest_tables`` are read.
        zabbix_data: Zabbix figures; ``data_freshness`` is read.

    Returns:
        A list of dicts with ``category``, ``severity``, ``description`` and
        ``recommendation``. Missing or unparsable inputs are treated as 0.
    """
    print("Identifying performance bottlenecks...")

    def _num(value, default=0.0):
        # Values coming from a database driver may be None, str or Decimal.
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    bottlenecks = []

    # CPU usage above 80 %.
    cpu_usage = _num(system_data.get('cpu', {}).get('usage_percent'))
    if cpu_usage > 80:
        bottlenecks.append({
            'category': 'CPU',
            'severity': 'high',
            'description': f"High CPU usage: {cpu_usage:.1f}%",
            'recommendation': "Investigate CPU-intensive processes and optimize Zabbix configuration"
        })

    # Memory usage above 85 %.
    memory_usage = _num(system_data.get('memory', {}).get('usage_percent'))
    if memory_usage > 85:
        bottlenecks.append({
            'category': 'Memory',
            'severity': 'high',
            'description': f"High memory usage: {memory_usage:.1f}%",
            'recommendation': "Check for memory leaks and optimize cache settings"
        })

    # More than 80 % of the allowed database connections in use.
    active_connections = db_data.get('active_connections', 0)
    max_connections = int(_num((db_data.get('status_variables') or {}).get('max_connections')))
    if max_connections > 0 and _num(active_connections) / max_connections > 0.8:
        bottlenecks.append({
            'category': 'Database',
            'severity': 'high',
            'description': f"High database connection usage: {active_connections}/{max_connections}",
            'recommendation': "Optimize database queries and connection pooling"
        })

    # Any slow queries at all.
    slow_queries = db_data.get('slow_queries') or []
    if len(slow_queries) > 0:
        bottlenecks.append({
            'category': 'Database',
            'severity': 'medium',
            'description': f"Found {len(slow_queries)} slow queries",
            'recommendation': "Optimize slow queries and add appropriate indexes"
        })

    # Fewer than 80 % of the items updated within the last hour.
    data_freshness = zabbix_data.get('data_freshness', {})
    if data_freshness:
        total_items = data_freshness.get('total_items', 0)
        updated_last_hour = data_freshness.get('updated_last_hour', 0)
        total = _num(total_items)
        if total > 0 and _num(updated_last_hour) / total < 0.8:
            bottlenecks.append({
                'category': 'Data Collection',
                'severity': 'medium',
                'description': f"Low data freshness: {updated_last_hour}/{total_items} items updated in last hour",
                'recommendation': "Check agent connectivity and polling configuration"
            })

    # Top three tables larger than 10 GB.
    largest_tables = db_data.get('largest_tables') or []
    for table in largest_tables[:3]:
        # The size may be None; comparing None with a number raises
        # TypeError.
        size_mb = _num(table.get('size_mb'))
        if size_mb > 10000:
            # The name key may arrive in either case depending on how the
            # result columns were labelled.
            table_name = table.get('table_name') or table.get('TABLE_NAME') or 'unknown'
            bottlenecks.append({
                'category': 'Storage',
                'severity': 'medium',
                'description': f"Large table: {table_name} ({size_mb:.1f} MB)",
                'recommendation': "Consider data archiving and partitioning"
            })
    return bottlenecks
def generate_optimization_recommendations(self, bottlenecks: List[Dict]) -> Dict:
    """Build optimization recommendations from a list of bottlenecks.

    Args:
        bottlenecks: Bottleneck dicts; each must carry a 'category' and a
            'severity' key.

    Returns:
        Dict with four lists of recommendation strings:
        'immediate_actions', 'configuration_changes',
        'infrastructure_improvements' and 'monitoring_enhancements'.
        Each recommendation appears at most once per list, even when
        several bottlenecks of the same category are present.
    """
    print("Generating optimization recommendations...")

    recommendations = {
        'immediate_actions': [],
        'configuration_changes': [],
        'infrastructure_improvements': [],
        'monitoring_enhancements': []
    }

    def add(section: str, *messages: str) -> None:
        # Several bottlenecks can share a category (e.g. connection usage
        # and slow queries are both 'Database'); keep each advice once.
        bucket = recommendations[section]
        for message in messages:
            if message not in bucket:
                bucket.append(message)

    for bottleneck in bottlenecks:
        category = bottleneck['category']
        severity = bottleneck['severity']

        if category == 'CPU' and severity == 'high':
            add('immediate_actions',
                "Review and optimize Zabbix server processes and polling intervals")
            add('configuration_changes',
                "Adjust StartPollers, StartPingers parameters in zabbix_server.conf")
        elif category == 'Memory' and severity == 'high':
            add('immediate_actions',
                "Check for memory leaks in Zabbix processes")
            # Only genuine cache-size parameters are listed here; the
            # previous text named SNMPTrapperFile, which is a file path.
            add('configuration_changes',
                "Optimize cache sizes: CacheSize, HistoryCacheSize, ValueCacheSize, VMwareCacheSize, etc.")
        elif category == 'Database':
            add('configuration_changes',
                "Optimize MySQL/PostgreSQL configuration for Zabbix workload",
                "Implement database partitioning for history tables",
                "Add appropriate indexes for frequently queried columns")
            add('infrastructure_improvements',
                "Consider SSD storage for database files")
        elif category == 'Data Collection':
            add('configuration_changes',
                "Review and optimize item polling intervals",
                "Implement bulk data collection where possible",
                "Check agent connectivity and timeout settings")
        elif category == 'Storage':
            add('immediate_actions',
                "Implement data housekeeping policies")
            add('infrastructure_improvements',
                "Plan storage capacity expansion")

    # Monitoring improvements are suggested regardless of the findings.
    add('monitoring_enhancements',
        "Implement Zabbix self-monitoring templates",
        "Set up performance baseline monitoring",
        "Create capacity planning dashboards",
        "Establish performance alerting thresholds")

    return recommendations
def run_complete_analysis(self) -> Dict:
    """Run every analysis step and merge the results into one report dict.

    Returns:
        {'error': 'Database connection failed'} when connect_database()
        reports failure; otherwise a dict with the timestamp, the three
        raw analysis results, the bottlenecks, the recommendations and a
        'summary' section. The connection is closed on the way out if
        self.connection is truthy.
    """
    print("Starting comprehensive performance analysis...")

    # Nothing can be analysed without a database connection.
    if not self.connect_database():
        return {'error': 'Database connection failed'}

    try:
        # Collect the raw measurements.
        system_data = self.analyze_system_performance()
        db_data = self.analyze_database_performance()
        zabbix_data = self.analyze_zabbix_internals()

        # Derive findings and the advice that follows from them.
        bottlenecks = self.identify_performance_bottlenecks(system_data, db_data, zabbix_data)
        recommendations = self.generate_optimization_recommendations(bottlenecks)

        # Overall status: any high-severity finding is critical, any
        # finding at all is a warning, none is healthy.
        high_count = sum(1 for item in bottlenecks if item['severity'] == 'high')
        if high_count > 0:
            overall_status = 'critical'
        elif bottlenecks:
            overall_status = 'warning'
        else:
            overall_status = 'healthy'

        summary = {
            'total_bottlenecks': len(bottlenecks),
            'high_severity_issues': high_count,
            'overall_status': overall_status
        }

        return {
            'timestamp': datetime.now().isoformat(),
            'system_performance': system_data,
            'database_performance': db_data,
            'zabbix_internals': zabbix_data,
            'bottlenecks': bottlenecks,
            'recommendations': recommendations,
            'summary': summary
        }
    finally:
        if self.connection:
            self.connection.close()
def save_analysis_report(self, analysis_result: Dict, filename: str = None):
    """Write the analysis result to a JSON file and return the file name.

    Args:
        analysis_result: Data to serialise; values json cannot encode
            natively are converted with str().
        filename: Target path. When empty or None, a name of the form
            zabbix_performance_analysis_<YYYYmmdd_HHMMSS>.json is used.

    Returns:
        The path the report was written to.
    """
    if not filename:
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"zabbix_performance_analysis_{stamp}.json"

    with open(filename, 'w') as report_fp:
        json.dump(analysis_result, report_fp, indent=2, default=str)

    print(f"Analysis report saved to: {filename}")
    return filename
# Usage example
def main():
    """Run a complete analysis, save the report and print a summary.

    Database settings default to the sample values and can be overridden
    with the ZABBIX_DB_HOST, ZABBIX_DB_NAME, ZABBIX_DB_USER and
    ZABBIX_DB_PASSWORD environment variables, so real credentials do not
    have to be written into this file.

    Returns:
        0 when the analysis completed, 1 when it reported an error.
    """
    import os  # local import: only this entry point needs it

    # Database settings
    db_config = {
        'host': os.environ.get('ZABBIX_DB_HOST', 'localhost'),
        'database': os.environ.get('ZABBIX_DB_NAME', 'zabbix'),
        'user': os.environ.get('ZABBIX_DB_USER', 'zabbix'),
        'password': os.environ.get('ZABBIX_DB_PASSWORD', 'zabbix_password')
    }

    # Run the performance analysis
    analyzer = ZabbixPerformanceAnalyzer(db_config)
    result = analyzer.run_complete_analysis()
    if 'error' in result:
        print(f"Analysis failed: {result['error']}")
        return 1

    # Save the report
    report_file = analyzer.save_analysis_report(result)

    # Print the summary
    print("\n" + "="*50)
    print("PERFORMANCE ANALYSIS SUMMARY")
    print("="*50)
    summary = result['summary']
    print(f"Overall Status: {summary['overall_status'].upper()}")
    print(f"Total Issues: {summary['total_bottlenecks']}")
    print(f"High Severity: {summary['high_severity_issues']}")

    if result['bottlenecks']:
        print("\nTop Issues:")
        # Only the first five findings are listed on the console.
        for i, bottleneck in enumerate(result['bottlenecks'][:5], 1):
            print(f"{i}. [{bottleneck['category']}] {bottleneck['description']}")

    print(f"\nDetailed report: {report_file}")
    return 0


if __name__ == "__main__":
    # Propagate the result as the process exit status.
    raise SystemExit(main())
ネットワーク問題
ネットワーク接続問題の診断
包括的ネットワーク診断
yaml
# Network problem classification
# Three categories follow (agent connection problems, performance
# problems, security restrictions); each lists its symptoms and the
# related checks, causes or remedies.
# NOTE(review): nesting indentation is missing in this copy - restore it
# before feeding the text to a YAML parser.
ネットワーク問題カテゴリ:
# Category 1: agent connection problems
エージェント接続問題:
症状:
- "Host is unreachable"
- "Agent timeout"
- "Connection refused"
- "No route to host"
# Diagnostic steps, in the order they should be carried out
診断手順:
1. 基本接続確認 (ping)
2. ポート到達性確認 (telnet/nc)
3. ファイアウォール確認
4. エージェント状態確認
5. ネットワーク経路確認
# Common causes
一般的原因:
- ファイアウォール設定
- エージェント停止
- ネットワーク設定変更
- DNS解決問題
# Category 2: performance problems
性能問題:
症状:
- データ収集遅延
- タイムアウトエラー頻発
- 不安定な接続
- パケットロス
# Items to measure
診断項目:
- ネットワーク遅延 (RTT)
- 帯域使用率
- パケット損失率
- TCP接続統計
# Ways to resolve
解決アプローチ:
- 監視間隔調整
- タイムアウト値調整
- 帯域制限実装
- QoS設定
# Category 3: security restrictions
セキュリティ制限:
症状:
- 暗号化接続失敗
- 認証エラー
- 証明書エラー
- アクセス拒否
# Items to verify
確認項目:
- TLS/SSL設定
- 証明書有効性
- 暗号化設定
- アクセス制御設定
自動ネットワーク診断スクリプト
bash
#!/bin/bash
# Zabbix network diagnostic and repair script.
#
# The paths below keep their original defaults but can be overridden from
# the environment (ZBX_NETDIAG_LOG_FILE, ZBX_NETDIAG_REPORT_FILE,
# ZBX_NETDIAG_CONFIG_FILE), e.g. to run unprivileged or to keep the report
# out of the world-writable /tmp directory.
SCRIPT_NAME="zabbix-network-diagnostic"
LOG_FILE="${ZBX_NETDIAG_LOG_FILE:-/var/log/zabbix/network_diagnostic.log}"
REPORT_FILE="${ZBX_NETDIAG_REPORT_FILE:-/tmp/network_diagnostic_report.txt}"
CONFIG_FILE="${ZBX_NETDIAG_CONFIG_FILE:-/etc/zabbix/zabbix_server.conf}"
# Logging helper
#######################################
# Print a timestamped log line to stdout and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - level label (e.g. INFO, WARNING, ERROR, SUCCESS)
#            $2 - message text
# Outputs:   "[timestamp] [level] message" on stdout and in LOG_FILE
#######################################
log_diagnostic() {
  local level="$1"
  local message="$2"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  # Create the log directory on first use; otherwise tee fails on every
  # call and nothing is ever persisted.
  local log_dir="${LOG_FILE%/*}"
  if [[ -n "$log_dir" && "$log_dir" != "$LOG_FILE" && ! -d "$log_dir" ]]; then
    mkdir -p -- "$log_dir" 2>/dev/null
  fi

  printf '[%s] [%s] %s\n' "$timestamp" "$level" "$message" | tee -a "$LOG_FILE"
}
# Basic network diagnosis
#######################################
# Check whether a TCP port has a listening socket on this machine.
# Uses ss (as the rest of this script does) and falls back to netstat;
# both print the local address in the fourth column with -ltn.
# Arguments: $1 - port number
# Returns:   0 listening, 1 not listening, 2 neither tool is available
#######################################
_zbx_port_is_listening() {
  local port="$1"
  local lister
  if command -v ss >/dev/null 2>&1; then
    lister=ss
  elif command -v netstat >/dev/null 2>&1; then
    lister=netstat
  else
    return 2
  fi

  # Compare the end of the local-address column so that e.g. port 10051
  # never matches a peer address or a longer number.
  "$lister" -ltn 2>/dev/null \
    | awk -v suffix=":${port}" '
        { start = length($4) - length(suffix) + 1 }
        start > 0 && substr($4, start) == suffix { found = 1 }
        END { exit !found }'
}

#######################################
# Start a fresh report and record interfaces, routes, DNS configuration
# and the listening state of the Zabbix server/agent ports.
# Globals:   REPORT_FILE (truncated, then written)
# Outputs:   report sections in REPORT_FILE; log lines via log_diagnostic
#######################################
diagnose_basic_connectivity() {
  log_diagnostic "INFO" "Starting basic network connectivity diagnosis..."

  # The first redirection truncates the report: this function opens it.
  {
    echo "# Network Connectivity Diagnosis Report"
    echo "Generated: $(date)"
    echo ""

    # Local interfaces
    echo "## Local Network Interfaces"
    ip addr show | grep -E "^[0-9]|inet "
    echo ""

    # Routing table
    echo "## Routing Table"
    ip route show
    echo ""

    # DNS configuration
    echo "## DNS Configuration"
    cat /etc/resolv.conf
    echo ""

    # Port status
    echo "## Zabbix Port Status"
  } > "$REPORT_FILE"

  local status

  # Zabbix server port (10051)
  _zbx_port_is_listening 10051
  status=$?
  case "$status" in
    0)
      echo "✓ Zabbix Server Port 10051: LISTENING" >> "$REPORT_FILE"
      log_diagnostic "SUCCESS" "Zabbix server port 10051 is listening"
      ;;
    1)
      echo "✗ Zabbix Server Port 10051: NOT LISTENING" >> "$REPORT_FILE"
      log_diagnostic "ERROR" "Zabbix server port 10051 is not listening"
      ;;
    *)
      echo "⚠ Zabbix Server Port 10051: UNKNOWN (ss/netstat not available)" >> "$REPORT_FILE"
      log_diagnostic "WARNING" "Cannot check port 10051: ss/netstat not available"
      ;;
  esac

  # Zabbix agent port (10050); a missing agent is only a warning
  _zbx_port_is_listening 10050
  status=$?
  case "$status" in
    0)
      echo "✓ Zabbix Agent Port 10050: LISTENING" >> "$REPORT_FILE"
      log_diagnostic "SUCCESS" "Zabbix agent port 10050 is listening"
      ;;
    1)
      echo "⚠ Zabbix Agent Port 10050: NOT LISTENING" >> "$REPORT_FILE"
      log_diagnostic "WARNING" "Zabbix agent port 10050 is not listening"
      ;;
    *)
      echo "⚠ Zabbix Agent Port 10050: UNKNOWN (ss/netstat not available)" >> "$REPORT_FILE"
      log_diagnostic "WARNING" "Cannot check port 10050: ss/netstat not available"
      ;;
  esac

  echo "" >> "$REPORT_FILE"
}
# Agent connectivity test
#######################################
# Read the database settings from the Zabbix server configuration, fetch
# up to 20 enabled hosts with an agent interface and run
# test_single_host_connectivity for each of them.
# Globals:   CONFIG_FILE (read), REPORT_FILE (appended)
# Outputs:   "Agent Connectivity Tests" section in REPORT_FILE
# NOTE(review): the password is still passed to mysql on the command line
#               and is visible in the process list while the query runs.
#######################################
test_agent_connectivity() {
  log_diagnostic "INFO" "Testing agent connectivity..."
  echo "## Agent Connectivity Tests" >> "$REPORT_FILE"

  # Read the DB settings. "-f2-" keeps values that contain '=' intact
  # (a plain -f2 would cut such a password short).
  local db_host db_name db_user db_password
  db_host=$(grep -m1 "^DBHost=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_name=$(grep -m1 "^DBName=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_user=$(grep -m1 "^DBUser=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_password=$(grep -m1 "^DBPassword=" "$CONFIG_FILE" | cut -d'=' -f2-)

  # An unset DBHost means "localhost" for the Zabbix server, so it must
  # not disable the test.
  db_host="${db_host:-localhost}"

  if [ -n "$db_name" ] && [ -n "$db_user" ]; then
    local -a mysql_args=(-h "$db_host" -u "$db_user")
    # A bare "-p" makes the client prompt interactively; only pass the
    # option when a password is actually configured.
    if [ -n "$db_password" ]; then
      mysql_args+=("-p${db_password}")
    fi

    # Enabled hosts (status = 0) with an interface of type 1
    local hosts
    hosts=$(mysql "${mysql_args[@]}" -e "
SELECT h.host, i.ip, i.port
FROM hosts h
JOIN interface i ON h.hostid = i.hostid
WHERE h.status = 0 AND i.type = 1
LIMIT 20;" -s -N "$db_name")

    if [ -n "$hosts" ]; then
      echo "### Active Host Connectivity Tests" >> "$REPORT_FILE"
      local hostname ip port
      while IFS=$'\t' read -r hostname ip port; do
        test_single_host_connectivity "$hostname" "$ip" "$port"
      done <<< "$hosts"
    else
      echo "No active hosts found in database" >> "$REPORT_FILE"
    fi
  else
    echo "Database configuration not available for host testing" >> "$REPORT_FILE"
  fi

  echo "" >> "$REPORT_FILE"
}
# Single host connectivity test
#######################################
# Test one host in three stages: ICMP ping, TCP connect to the agent port
# and, when zabbix_get is installed, an agent.ping request.
# Globals:   REPORT_FILE (appended)
# Arguments: $1 - host name (used for display only)
#            $2 - IP address to test
#            $3 - agent port (default 10050)
# Returns:   1 when the ping or the port test failed, 0 otherwise
#            (an agent.ping failure is reported but does not change it)
#######################################
test_single_host_connectivity() {
  local hostname="$1"
  local ip="$2"
  local port="${3:-10050}"
  local rc=0

  echo "#### Testing: $hostname ($ip:$port)" >> "$REPORT_FILE"

  # Ping test. ICMP is frequently filtered even when the agent is
  # reachable, so a failure is recorded but the port test still runs.
  if ping -c 3 -W 2 "$ip" >/dev/null 2>&1; then
    echo " ✓ Ping: SUCCESS" >> "$REPORT_FILE"
  else
    echo " ✗ Ping: FAILED" >> "$REPORT_FILE"
    log_diagnostic "WARNING" "Ping failed for $hostname ($ip)"
    rc=1
  fi

  # Port test. Address and port come from the database, so they are
  # handed to the child shell as arguments instead of being pasted into
  # the command string (which would allow command injection).
  if timeout 5 bash -c ': </dev/tcp/"$1"/"$2"' _ "$ip" "$port" 2>/dev/null; then
    echo " ✓ Port $port: ACCESSIBLE" >> "$REPORT_FILE"
  else
    echo " ✗ Port $port: NOT ACCESSIBLE" >> "$REPORT_FILE"
    log_diagnostic "WARNING" "Port $port not accessible on $hostname ($ip)"
    echo "" >> "$REPORT_FILE"
    return 1
  fi

  # Zabbix agent response test
  if command -v zabbix_get >/dev/null; then
    local agent_response
    agent_response=$(timeout 10 zabbix_get -s "$ip" -p "$port" -k "agent.ping" 2>/dev/null)
    if [ "$agent_response" = "1" ]; then
      echo " ✓ Agent Response: SUCCESS" >> "$REPORT_FILE"
    else
      echo " ✗ Agent Response: FAILED ($agent_response)" >> "$REPORT_FILE"
      log_diagnostic "WARNING" "Agent response failed for $hostname ($ip)"
    fi
  else
    echo " - Agent Response: zabbix_get not available" >> "$REPORT_FILE"
  fi

  echo "" >> "$REPORT_FILE"
  return "$rc"
}
# Firewall diagnosis
#######################################
# Tell whether firewalld lets a Zabbix port through, either as an
# explicitly opened TCP port or through the matching firewalld service.
# Arguments: $1 - port number
#            $2 - firewalld service name (zabbix-server / zabbix-agent)
# Returns:   0 when allowed, non-zero otherwise
#######################################
_zbx_firewalld_allows() {
  local port="$1"
  local service="$2"
  firewall-cmd --query-port="${port}/tcp" >/dev/null 2>&1 \
    || firewall-cmd --query-service="$service" >/dev/null 2>&1
}

#######################################
# Record the firewall state relevant to Zabbix: firewalld when it is
# active, otherwise iptables when the command exists.
# Globals:   REPORT_FILE (appended)
# Outputs:   "Firewall Diagnosis" section in REPORT_FILE
#######################################
diagnose_firewall() {
  log_diagnostic "INFO" "Diagnosing firewall configuration..."
  echo "## Firewall Diagnosis" >> "$REPORT_FILE"

  if systemctl is-active firewalld >/dev/null 2>&1; then
    # firewalld
    local current_zone
    current_zone=$(firewall-cmd --get-default-zone 2>/dev/null)
    {
      echo "### Firewalld Configuration"
      echo "Status: Active"
      # Default zone
      echo "Default Zone: $current_zone"
      # Open ports
      echo "Open Ports:"
      firewall-cmd --list-ports 2>/dev/null
      # Allowed services
      echo "Allowed Services:"
      firewall-cmd --list-services 2>/dev/null
    } >> "$REPORT_FILE"

    # Zabbix ports: allowed by port rule or by service
    if _zbx_firewalld_allows 10051 zabbix-server; then
      echo "✓ Zabbix Server Port 10051: ALLOWED" >> "$REPORT_FILE"
    else
      echo "✗ Zabbix Server Port 10051: BLOCKED" >> "$REPORT_FILE"
      log_diagnostic "WARNING" "Firewall blocking Zabbix server port 10051"
    fi

    if _zbx_firewalld_allows 10050 zabbix-agent; then
      echo "✓ Zabbix Agent Port 10050: ALLOWED" >> "$REPORT_FILE"
    else
      echo "✗ Zabbix Agent Port 10050: BLOCKED" >> "$REPORT_FILE"
      log_diagnostic "WARNING" "Firewall blocking Zabbix agent port 10050"
    fi
  elif command -v iptables >/dev/null; then
    # iptables
    echo "### Iptables Configuration" >> "$REPORT_FILE"

    # Rules in the INPUT chain that mention a Zabbix port
    local input_rules
    input_rules=$(iptables -L INPUT -n | grep -E "(10050|10051)")

    if [ -n "$input_rules" ]; then
      echo "Zabbix Related Rules:" >> "$REPORT_FILE"
      echo "$input_rules" >> "$REPORT_FILE"
    else
      echo "No specific Zabbix rules found in INPUT chain" >> "$REPORT_FILE"

      # Default policy; -n avoids slow reverse DNS lookups while listing
      local default_policy
      default_policy=$(iptables -L INPUT -n | head -1 | grep -o "policy [A-Z]*" | cut -d' ' -f2)
      echo "Default INPUT policy: $default_policy" >> "$REPORT_FILE"

      if [ "$default_policy" = "DROP" ] || [ "$default_policy" = "REJECT" ]; then
        log_diagnostic "WARNING" "Restrictive firewall policy may block Zabbix traffic"
      fi
    fi
  else
    echo "No recognized firewall system detected" >> "$REPORT_FILE"
  fi

  echo "" >> "$REPORT_FILE"
}
# Network performance diagnosis
#######################################
# Append interface counters, the socket summary and the Zabbix-related
# listening/established sockets to the report.
# Globals:   REPORT_FILE (appended)
# Outputs:   "Network Performance Analysis" section in REPORT_FILE
#######################################
diagnose_network_performance() {
  log_diagnostic "INFO" "Diagnosing network performance..."

  local established_count
  {
    echo "## Network Performance Analysis"

    # Interface counters: the two header lines, then eth*/ens*/enp* rows
    echo "### Network Interface Statistics"
    head -2 /proc/net/dev
    grep -E "(eth|ens|enp)" /proc/net/dev
    echo ""

    # Socket summary
    echo "### TCP Connection Statistics"
    ss -s
    echo ""

    # Sockets that mention a Zabbix port
    echo "### Active Zabbix Connections"
    ss -tuln | grep -E "(10050|10051)"

    echo "### Established Connections to Zabbix Ports"
    established_count=$(ss -tuna | grep -E "(10050|10051)" | grep -c ESTAB)
    echo "Established connections: $established_count"
    echo ""
  } >> "$REPORT_FILE"
}
# DNS resolution check
#######################################
# Resolve a name to its first address.
# Uses dig when installed and getent otherwise, so that a missing dig
# does not make every host look unresolvable.
# Arguments: $1 - DNS name
# Outputs:   first answer on stdout (nothing when resolution fails)
#######################################
_zbx_resolve_name() {
  local name="$1"
  if command -v dig >/dev/null 2>&1; then
    dig +short "$name" 2>/dev/null | head -1
  else
    getent hosts "$name" 2>/dev/null | awk '{ print $1; exit }'
  fi
}

#######################################
# Fetch up to 10 DNS names used by enabled hosts from the Zabbix
# database and report whether each one resolves.
# Globals:   CONFIG_FILE (read), REPORT_FILE (appended)
# Outputs:   "DNS Resolution Tests" section in REPORT_FILE
# NOTE(review): the password is still passed to mysql on the command line
#               and is visible in the process list while the query runs.
#######################################
test_dns_resolution() {
  log_diagnostic "INFO" "Testing DNS resolution..."
  echo "## DNS Resolution Tests" >> "$REPORT_FILE"

  # Read the DB settings. "-f2-" keeps values that contain '=' intact.
  local db_host db_name db_user db_password
  db_host=$(grep -m1 "^DBHost=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_name=$(grep -m1 "^DBName=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_user=$(grep -m1 "^DBUser=" "$CONFIG_FILE" | cut -d'=' -f2-)
  db_password=$(grep -m1 "^DBPassword=" "$CONFIG_FILE" | cut -d'=' -f2-)

  # An unset DBHost means "localhost" for the Zabbix server.
  db_host="${db_host:-localhost}"

  if [ -n "$db_name" ] && [ -n "$db_user" ]; then
    local -a mysql_args=(-h "$db_host" -u "$db_user")
    # A bare "-p" would make the client prompt interactively.
    if [ -n "$db_password" ]; then
      mysql_args+=("-p${db_password}")
    fi

    # Hosts that are addressed by DNS name
    local dns_hosts
    dns_hosts=$(mysql "${mysql_args[@]}" -e "
SELECT DISTINCT i.dns
FROM interface i
JOIN hosts h ON i.hostid = h.hostid
WHERE h.status = 0 AND i.type = 1 AND i.dns != ''
LIMIT 10;" -s -N "$db_name")

    if [ -n "$dns_hosts" ]; then
      echo "### DNS Resolution Tests for Monitored Hosts" >> "$REPORT_FILE"
      local dns_name resolved_ip
      while read -r dns_name; do
        if [ -n "$dns_name" ]; then
          resolved_ip=$(_zbx_resolve_name "$dns_name")
          if [ -n "$resolved_ip" ]; then
            echo "✓ $dns_name -> $resolved_ip" >> "$REPORT_FILE"
          else
            echo "✗ $dns_name -> RESOLUTION FAILED" >> "$REPORT_FILE"
            log_diagnostic "WARNING" "DNS resolution failed for $dns_name"
          fi
        fi
      done <<< "$dns_hosts"
    else
      echo "No DNS-based hosts found" >> "$REPORT_FILE"
    fi
  else
    # Say why the section is empty instead of leaving it blank.
    echo "Database configuration not available for DNS testing" >> "$REPORT_FILE"
  fi

  echo "" >> "$REPORT_FILE"
}
# Automatic repair
#######################################
# Try to repair common problems: open the Zabbix ports in firewalld,
# start the Zabbix agent service and add fallback DNS servers when the
# resolver configuration has none. Each fix is counted only when the
# command that applies it actually succeeded.
# Globals:   REPORT_FILE (appended)
#            RESOLV_CONF (read, optional) - resolver file to check and
#            repair; defaults to /etc/resolv.conf
# Outputs:   "Automatic Fix Attempts" section in REPORT_FILE
#######################################
auto_fix_network_issues() {
  log_diagnostic "INFO" "Attempting automatic network issue fixes..."
  echo "## Automatic Fix Attempts" >> "$REPORT_FILE"

  local fixes_applied=0
  local resolv_conf="${RESOLV_CONF:-/etc/resolv.conf}"

  # Firewall: open the server and agent ports when firewalld is active
  if systemctl is-active firewalld >/dev/null 2>&1; then
    local entry port role
    for entry in "10051:server" "10050:agent"; do
      port="${entry%%:*}"
      role="${entry##*:}"
      if ! firewall-cmd --query-port="${port}/tcp" >/dev/null 2>&1; then
        # The permanent rule only takes effect after a successful reload,
        # so both steps must succeed before the fix is reported.
        if firewall-cmd --permanent --add-port="${port}/tcp" >/dev/null 2>&1 \
          && firewall-cmd --reload >/dev/null 2>&1; then
          echo "✓ Opened Zabbix $role port $port in firewall" >> "$REPORT_FILE"
          log_diagnostic "SUCCESS" "Opened firewall port $port"
          fixes_applied=$((fixes_applied + 1))
        else
          echo "✗ Failed to open Zabbix $role port $port" >> "$REPORT_FILE"
          log_diagnostic "ERROR" "Failed to open firewall port $port"
        fi
      fi
    done
  fi

  # Zabbix agent: start the service when it is not active
  if ! systemctl is-active zabbix-agent >/dev/null 2>&1; then
    if systemctl start zabbix-agent >/dev/null 2>&1; then
      echo "✓ Started Zabbix agent service" >> "$REPORT_FILE"
      log_diagnostic "SUCCESS" "Started Zabbix agent service"
      fixes_applied=$((fixes_applied + 1))
    else
      echo "✗ Failed to start Zabbix agent service" >> "$REPORT_FILE"
      log_diagnostic "ERROR" "Failed to start Zabbix agent service"
    fi
  fi

  # DNS: add fallback servers when no active nameserver line exists.
  # The pattern is anchored so commented-out entries do not count.
  if ! grep -qE '^[[:space:]]*nameserver[[:space:]]' "$resolv_conf" 2>/dev/null; then
    if { printf 'nameserver 8.8.8.8\nnameserver 8.8.4.4\n' >> "$resolv_conf"; } 2>/dev/null; then
      echo "✓ Added fallback DNS servers" >> "$REPORT_FILE"
      log_diagnostic "SUCCESS" "Added fallback DNS servers"
      fixes_applied=$((fixes_applied + 1))
    else
      echo "✗ Failed to add fallback DNS servers" >> "$REPORT_FILE"
      log_diagnostic "ERROR" "Failed to add fallback DNS servers"
    fi
  fi

  echo "" >> "$REPORT_FILE"
  echo "Total automatic fixes applied: $fixes_applied" >> "$REPORT_FILE"

  if [ "$fixes_applied" -gt 0 ]; then
    echo "⚠ Please test connectivity after automatic fixes" >> "$REPORT_FILE"
    log_diagnostic "INFO" "Applied $fixes_applied automatic fixes"
  fi

  echo "" >> "$REPORT_FILE"
}
# Recommendation section
#######################################
# Append the static recommendation text (general advice, automation
# advice and script usage hints) to the report.
# Globals:   REPORT_FILE (appended)
# Outputs:   "Recommendations" section in REPORT_FILE
#######################################
generate_recommendations() {
  log_diagnostic "INFO" "Generating network recommendations..."

  # One printf call emits every line; empty arguments are blank lines.
  printf '%s\n' \
    "## Recommendations" \
    "### General Network Recommendations" \
    "1. Regularly monitor network connectivity to all monitored hosts" \
    "2. Implement network redundancy for critical monitoring paths" \
    "3. Configure appropriate timeouts based on network characteristics" \
    "4. Use encrypted connections (PSK/certificates) for secure environments" \
    "5. Monitor network performance metrics (latency, packet loss)" \
    "" \
    "### Automation Recommendations" \
    "1. Schedule regular network diagnostics" \
    "2. Implement automated agent health monitoring" \
    "3. Set up network path monitoring" \
    "4. Create network troubleshooting runbooks" \
    "" \
    "### Script Usage" \
    "Run this script regularly:" \
    "- Daily: $0 quick" \
    "- Weekly: $0 full" \
    "- After changes: $0 full --fix" \
    "" >> "$REPORT_FILE"
}
# Main entry point
#######################################
# Run the diagnostics selected by the mode argument, optionally apply
# automatic fixes, then append the recommendations and print a summary.
# Globals:   REPORT_FILE (truncated, then read), LOG_FILE (read)
# Arguments: [mode]  quick | firewall | performance | dns | full
#                    (default full; unknown modes behave like full)
#            [--fix] apply automatic fixes; accepted in any position
# Outputs:   progress and summary on stdout
#######################################
main() {
  local mode=""
  local auto_fix=""
  local arg

  # The first non-option argument is the mode; --fix may come before or
  # after it ("full --fix" keeps working as before).
  for arg in "$@"; do
    if [ "$arg" = "--fix" ]; then
      auto_fix="--fix"
    elif [ -z "$mode" ]; then
      mode="$arg"
    fi
  done
  mode="${mode:-full}"

  echo "Zabbix Network Diagnostic Tool"
  echo "==============================="
  log_diagnostic "INFO" "Starting network diagnostic (mode: $mode)..."

  # Start from an empty report in every mode. Only the basic diagnosis
  # resets the file itself, so the other modes would otherwise append to
  # (and raise warnings from) the report of an earlier run.
  : > "$REPORT_FILE"

  case "$mode" in
    "quick")
      diagnose_basic_connectivity
      test_agent_connectivity
      ;;
    "firewall")
      diagnose_firewall
      ;;
    "performance")
      diagnose_network_performance
      ;;
    "dns")
      test_dns_resolution
      ;;
    "full"|*)
      diagnose_basic_connectivity
      test_agent_connectivity
      diagnose_firewall
      diagnose_network_performance
      test_dns_resolution
      ;;
  esac

  # Automatic repair (optional)
  if [ "$auto_fix" = "--fix" ]; then
    auto_fix_network_issues
  fi

  generate_recommendations

  echo ""
  echo "Network diagnostic completed."
  echo "Report saved to: $REPORT_FILE"
  echo "Log file: $LOG_FILE"

  # Any failure marker in the report triggers a console warning
  if grep -q "✗" "$REPORT_FILE"; then
    echo ""
    echo "⚠ WARNING: Network issues detected!"
    echo "Please review the report for details."
  fi
}
# Help text
#######################################
# Print usage, modes, options and examples.
# Outputs: help text on stdout
#######################################
show_help() {
  # One printf call emits every line; empty arguments are blank lines.
  printf '%s\n' \
    "Zabbix Network Diagnostic Tool" \
    "" \
    "USAGE:" \
    " $0 [mode] [options]" \
    "" \
    "MODES:" \
    " full Complete network diagnosis (default)" \
    " quick Basic connectivity check" \
    " firewall Firewall configuration check" \
    " performance Network performance analysis" \
    " dns DNS resolution testing" \
    "" \
    "OPTIONS:" \
    " --fix Attempt automatic fixes" \
    " --help Show this help" \
    "" \
    "EXAMPLES:" \
    " $0 # Full network diagnosis" \
    " $0 quick # Quick connectivity check" \
    " $0 full --fix # Full diagnosis with auto-fix"
}
# Script entry: show the help text when the first argument asks for it,
# otherwise hand all arguments to main.
case "${1:-}" in
  --help|-h)
    show_help
    exit 0
    ;;
esac

main "$@"
データベース問題
データベース問題の診断と解決
データベース健康診断システム
sql
-- Zabbix database diagnostic query collection
-- Every query filters on the schema name 'zabbix'; adjust it when the
-- database is named differently.
-- 1. Basic database information
-- Both rows share the column label 'Value (GB)' (a UNION takes its column
-- names from the first SELECT); the 'Total Tables' row is a table count,
-- not a size.
SELECT
'Database Size' as metric,
ROUND(SUM(data_length + index_length) / 1024 / 1024 / 1024, 2) AS 'Value (GB)'
FROM information_schema.tables
WHERE table_schema = 'zabbix'
UNION ALL
SELECT
'Total Tables' as metric,
COUNT(*) AS 'Value (GB)'
FROM information_schema.tables
WHERE table_schema = 'zabbix';
-- 2. Ten largest tables (data + index size), with the share of the size
-- taken by indexes
SELECT
table_name,
ROUND(((data_length + index_length) / 1024 / 1024), 2) AS 'Size (MB)',
table_rows,
ROUND((index_length / (data_length + index_length)) * 100, 2) AS 'Index Ratio (%)'
FROM information_schema.TABLES
WHERE table_schema = 'zabbix'
ORDER BY (data_length + index_length) DESC
LIMIT 10;
-- 3. History table statistics
-- 'Est Size (MB)' multiplies the row count by a fixed per-row byte figure
-- (24, 20 and 255); it is a rough estimate, not a measured size.
-- NOTE(review): COUNT(*) scans each history table in full and can be slow
-- on large installations - confirm before running in production.
SELECT
'history' as table_name,
COUNT(*) as record_count,
MIN(FROM_UNIXTIME(clock)) as oldest_record,
MAX(FROM_UNIXTIME(clock)) as newest_record,
ROUND((COUNT(*) * 24) / 1024 / 1024, 2) as 'Est Size (MB)'
FROM history
UNION ALL
SELECT
'history_uint' as table_name,
COUNT(*) as record_count,
MIN(FROM_UNIXTIME(clock)) as oldest_record,
MAX(FROM_UNIXTIME(clock)) as newest_record,
ROUND((COUNT(*) * 20) / 1024 / 1024, 2) as 'Est Size (MB)'
FROM history_uint
UNION ALL
SELECT
'history_str' as table_name,
COUNT(*) as record_count,
MIN(FROM_UNIXTIME(clock)) as oldest_record,
MAX(FROM_UNIXTIME(clock)) as newest_record,
ROUND((COUNT(*) * 255) / 1024 / 1024, 2) as 'Est Size (MB)'
FROM history_str;
-- 4. Performance statistics (server status counters)
SHOW STATUS LIKE 'Connections';
SHOW STATUS LIKE 'Threads_connected';
SHOW STATUS LIKE 'Questions';
SHOW STATUS LIKE 'Slow_queries';
SHOW STATUS LIKE 'Innodb_buffer_pool_read_requests';
SHOW STATUS LIKE 'Innodb_buffer_pool_reads';
-- 5. Index efficiency check
-- Selectivity is cardinality / table_rows, limited to tables with more
-- than 1000 rows.
-- NOTE(review): both inputs look like optimizer estimates, so values
-- above 100% may appear - confirm before acting on single rows.
SELECT
s.table_name,
s.index_name,
s.cardinality,
ROUND((s.cardinality / t.table_rows) * 100, 2) as selectivity_percent
FROM information_schema.statistics s
JOIN information_schema.tables t ON s.table_schema = t.table_schema
AND s.table_name = t.table_name
WHERE s.table_schema = 'zabbix'
AND s.cardinality IS NOT NULL
AND t.table_rows > 1000
ORDER BY selectivity_percent DESC;
-- 6. Configuration check
-- NOTE(review): query_cache_size presumably returns no row on MySQL 8.0+,
-- where the query cache was removed - verify against the target version.
SHOW VARIABLES LIKE 'innodb_buffer_pool_size';
SHOW VARIABLES LIKE 'innodb_log_file_size';
SHOW VARIABLES LIKE 'query_cache_size';
SHOW VARIABLES LIKE 'max_connections';
SHOW VARIABLES LIKE 'tmp_table_size';
SHOW VARIABLES LIKE 'max_heap_table_size';
まとめ
体系的なZabbixトラブルシューティングアプローチにより、問題の迅速な特定・解決と安定した監視環境の維持が可能となります。
重要ポイント
- 体系的診断: 構造化されたアプローチによる効率的な問題特定
- 自動化: スクリプトとツールによる診断・解決の自動化
- 予防的対策: 問題発生前の兆候検知と対処
- 知識蓄積: トラブルシューティング事例の文書化と共有
次のステップ
次章では、実践的なシナリオ事例について学習し、様々な環境での実装パターンを習得します。