8.3 トラブルシューティング

Zabbix環境で発生する一般的な問題の特定・分析・解決手法の体系的アプローチ

概要

Zabbixトラブルシューティングは、監視システムで発生する様々な問題を迅速かつ的確に解決するための重要なスキルです。体系的な問題分析アプローチと豊富な解決事例により、サービス品質の維持と運用効率の向上を実現できます。

トラブルシューティングの重要性

| 要素 | 影響範囲 | 対応効果 |
|------|----------|----------|
| 迅速な問題特定 | サービス停止時間短縮 | 可用性向上・損失削減 |
| 根本原因分析 | 再発防止・品質向上 | 長期安定性・信頼性 |
| 予防的対策 | 障害未然防止 | 運用コスト削減 |
| 知識蓄積 | 技術力向上・標準化 | 組織能力・効率性 |
| 自動化 | 人的ミス削減 | 運用品質・一貫性 |

問題分析の体系的アプローチ

ITIL準拠問題管理プロセス

問題分類フレームワーク

yaml
# 問題分類体系
問題カテゴリ:
  レベル1 - サービス影響:
    Critical: "全サービス停止"
    High: "主要機能影響"
    Medium: "部分機能影響"
    Low: "軽微な問題"
    
  レベル2 - 技術領域:
    Infrastructure: "OS・ハードウェア"
    Application: "Zabbix本体"
    Database: "データベース関連"
    Network: "ネットワーク・通信"
    Security: "セキュリティ・認証"
    Performance: "性能・レスポンス"
    
  レベル3 - 具体的症状:
    Connection: "接続問題"
    Data: "データ関連"
    Configuration: "設定問題"
    Resource: "リソース不足"
    Bug: "ソフトウェア欠陥"
    Change: "変更起因"

# 問題解決プロセス
解決プロセス:
  Step 1 - 初期対応:
    - 症状確認・記録
    - 影響範囲特定
    - 緊急回避策実施
    - ステークホルダー通知
    
  Step 2 - 問題分析:
    - ログ収集・分析
    - 環境情報収集
    - 再現テスト実施
    - 根本原因特定
    
  Step 3 - 解決実装:
    - 解決策設計
    - テスト環境検証
    - 本番環境適用
    - 効果確認
    
  Step 4 - 事後対応:
    - 再発防止策実装
    - ドキュメント更新
    - 知識ベース登録
    - プロセス改善

診断ツールセット

統合診断スクリプト

bash
#!/bin/bash
# Zabbix integrated diagnostic script
#
# File-scope settings shared by every function below.

SCRIPT_NAME="zabbix-diagnostic"
VERSION="2.1"
# Console/log output of each run goes to its own timestamped file
LOG_FILE="/var/log/zabbix/diagnostic_$(date +%Y%m%d_%H%M%S).log"
# The report path is fixed, so every run writes to the same file
REPORT_FILE="/tmp/zabbix_diagnostic_report.txt"
CONFIG_FILE="/etc/zabbix/zabbix_server.conf"

# ANSI color escape sequences used by log_message for console output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Log a message to the log file and to the console.
# Globals:   LOG_FILE (appended to); RED, YELLOW, GREEN, BLUE, NC (read)
# Arguments: $1 - level: ERROR, WARNING, SUCCESS or INFO (any other value is
#                 printed without a symbol or color)
#            $2 - message text
# Outputs:   a timestamped line on stdout and in LOG_FILE, followed by a
#            colored line on stdout
log_message() {
    local level="$1"
    local message="$2"
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    printf '[%s] [%s] %s\n' "$timestamp" "$level" "$message" | tee -a "$LOG_FILE"

    # Color codes go through %b so their \033 escapes are expanded; the message
    # goes through %s so backslashes in it (paths, regexes, command output)
    # are printed literally instead of being interpreted as escapes
    case "$level" in
        "ERROR")   printf '%b✗ %s%b\n' "$RED" "$message" "$NC" ;;
        "WARNING") printf '%b⚠ %s%b\n' "$YELLOW" "$message" "$NC" ;;
        "SUCCESS") printf '%b✓ %s%b\n' "$GREEN" "$message" "$NC" ;;
        "INFO")    printf '%bℹ %s%b\n' "$BLUE" "$message" "$NC" ;;
        *)         printf '%s\n' "$message" ;;
    esac
}

# Start a fresh report with a header and a snapshot of host resources.
# Globals:   REPORT_FILE (truncated and rewritten)
# Outputs:   report header plus CPU, memory, disk and network sections
collect_system_info() {
    log_message "INFO" "Collecting system information..."

    # Each command's output is captured and printed with a trailing newline,
    # so a command that prints nothing still leaves an empty line in its section
    {
        printf '%s\n' "# Zabbix Diagnostic Report" ""
        printf '%s\n' "Generated: $(date)"
        printf '%s\n' "Hostname: $(hostname)"
        printf '%s\n' "OS: $(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2)"
        printf '%s\n' "Kernel: $(uname -r)"
        printf '%s\n' "Uptime: $(uptime)" ""
        printf '%s\n' "## System Resources" ""
        printf '%s\n' "### CPU Information" \
            "$(lscpu | grep -E "(Architecture|CPU|Thread|Socket)")" ""
        printf '%s\n' "### Memory Information" "$(free -h)" ""
        printf '%s\n' "### Disk Usage" "$(df -h | grep -E '^/dev/')" ""
        printf '%s\n' "### Network Interfaces" \
            "$(ip addr show | grep -E '^[0-9]|inet ')" ""
    } > "$REPORT_FILE"
}

# Record whether the Zabbix server and agent daemons are running.
# Globals:   REPORT_FILE (appended to)
# Outputs:   "Zabbix Process Status" report section; status lines via log_message
check_zabbix_processes() {
    log_message "INFO" "Checking Zabbix processes..."

    echo "## Zabbix Process Status" >> "$REPORT_FILE"

    # -x matches the exact process name. Matching the full command line (-f)
    # also counts unrelated commands such as "tail -f .../zabbix_server.log".
    if pgrep -x zabbix_server > /dev/null; then
        log_message "SUCCESS" "Zabbix server process is running"

        # Several zabbix_server processes normally match; -o picks the oldest
        # one so that exactly one PID is reported and handed to ps
        local server_pid server_threads
        server_pid=$(pgrep -o -x zabbix_server)
        server_threads=$(ps -o thcount= -p "$server_pid" | tr -d ' ')

        {
            echo "### Zabbix Server"
            echo "- Status: Running"
            echo "- PID: $server_pid"
            echo "- Threads: $server_threads"
            echo "- Process Details:"
            # Full listing of every matching process (comma-separated PID list)
            ps -fp "$(pgrep -d, -x zabbix_server)"
        } >> "$REPORT_FILE"

    else
        log_message "ERROR" "Zabbix server process is not running"
        echo "### Zabbix Server" >> "$REPORT_FILE"
        echo "- Status: NOT RUNNING" >> "$REPORT_FILE"

        # If the unit is enabled, also record what systemd thinks of it
        if systemctl is-enabled zabbix-server >/dev/null 2>&1; then
            local service_status
            service_status=$(systemctl is-active zabbix-server)
            echo "- Service Status: $service_status" >> "$REPORT_FILE"
        fi
    fi

    # Zabbix agent daemon
    if pgrep -x zabbix_agentd > /dev/null; then
        log_message "SUCCESS" "Zabbix agent process is running"
        {
            echo "### Zabbix Agent"
            echo "- Status: Running"
            ps -fp "$(pgrep -d, -x zabbix_agentd)"
        } >> "$REPORT_FILE"
    else
        log_message "WARNING" "Zabbix agent process is not running"
        echo "### Zabbix Agent" >> "$REPORT_FILE"
        echo "- Status: NOT RUNNING" >> "$REPORT_FILE"
    fi
}

# Validate the server configuration file and list key server/agent settings.
# Globals:   CONFIG_FILE (read), REPORT_FILE (appended to)
# Outputs:   "Configuration Verification" report section; status via log_message
verify_configuration() {
    log_message "INFO" "Verifying Zabbix configuration..."

    echo "## Configuration Verification" >> "$REPORT_FILE"

    # Server configuration
    if [ -f "$CONFIG_FILE" ]; then
        log_message "INFO" "Found Zabbix server configuration file"

        # Declared separately from the assignment: "local v=$(cmd)" would make
        # $? the status of "local" (always 0) and every test would "pass".
        # -T (--test-config) is the server's validate-and-exit option; it is
        # available from Zabbix 7.0, older servers reject it and are reported
        # as FAILED together with the option error.
        local config_test test_result
        config_test=$(zabbix_server -c "$CONFIG_FILE" -T 2>&1)
        test_result=$?

        echo "### Server Configuration Test" >> "$REPORT_FILE"
        if [ "$test_result" -eq 0 ]; then
            log_message "SUCCESS" "Zabbix server configuration is valid"
            echo "- Result: PASSED" >> "$REPORT_FILE"
        else
            log_message "ERROR" "Zabbix server configuration has errors"
            {
                echo "- Result: FAILED"
                echo "- Errors:"
                printf '%s\n' "$config_test"
            } >> "$REPORT_FILE"
        fi

        # Key parameters (DBPassword is deliberately not in this list)
        echo "### Key Configuration Parameters" >> "$REPORT_FILE"
        grep -E "^(DBHost|DBName|DBUser|LogFile|PidFile|ListenPort|StartPollers)" "$CONFIG_FILE" >> "$REPORT_FILE"

    else
        log_message "ERROR" "Zabbix server configuration file not found: $CONFIG_FILE"
        echo "### Server Configuration" >> "$REPORT_FILE"
        echo "- Status: Configuration file not found" >> "$REPORT_FILE"
    fi

    # Agent configuration, only when the agent is installed on this host
    local agent_config="/etc/zabbix/zabbix_agentd.conf"
    if [ -f "$agent_config" ]; then
        echo "### Agent Configuration" >> "$REPORT_FILE"
        grep -E "^(Server|ServerActive|Hostname|ListenPort)" "$agent_config" >> "$REPORT_FILE"
    fi
}

# Test the MySQL/MariaDB connection configured for the Zabbix server and
# record basic database statistics.
# Globals:   CONFIG_FILE (read), REPORT_FILE (appended to)
# Outputs:   "Database Connectivity" report section; status via log_message
check_database_connectivity() {
    log_message "INFO" "Checking database connectivity..."

    echo "## Database Connectivity" >> "$REPORT_FILE"

    # "cut -f2-" keeps everything after the first "=", so values that contain
    # "=" themselves (typically passwords) are not truncated. "tail -n 1" keeps
    # a repeated parameter from producing a multi-line value.
    local db_host db_name db_user db_password
    db_host=$(grep "^DBHost=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)
    db_name=$(grep "^DBName=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)
    db_user=$(grep "^DBUser=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)
    db_password=$(grep "^DBPassword=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)

    # DBHost is optional in zabbix_server.conf and defaults to localhost
    db_host=${db_host:-localhost}

    if [ -n "$db_name" ] && [ -n "$db_user" ]; then
        # The password is passed via MYSQL_PWD instead of -p<password>: it stays
        # out of the process argument list, and an empty password no longer
        # turns into a bare "-p" that waits for interactive input
        if MYSQL_PWD="$db_password" mysql -h "$db_host" -u "$db_user" -e "SELECT 1;" "$db_name" >/dev/null 2>&1; then
            log_message "SUCCESS" "Database connection successful"
            echo "- Connection: SUCCESS" >> "$REPORT_FILE"

            # Size and table count of the connected schema. DATABASE() avoids
            # pasting the configured name into the SQL text.
            echo "### Database Statistics" >> "$REPORT_FILE"
            MYSQL_PWD="$db_password" mysql -h "$db_host" -u "$db_user" -e "
                SELECT
                    ROUND(SUM(data_length + index_length) / 1024 / 1024, 2) AS 'DB Size (MB)',
                    COUNT(*) AS 'Total Tables'
                FROM information_schema.tables
                WHERE table_schema = DATABASE();" "$db_name" >> "$REPORT_FILE"

            # Connection usage versus the configured maximum
            echo "### Connection Information" >> "$REPORT_FILE"
            MYSQL_PWD="$db_password" mysql -h "$db_host" -u "$db_user" -e "
                SHOW STATUS LIKE 'Threads_connected';
                SHOW STATUS LIKE 'Max_used_connections';
                SHOW VARIABLES LIKE 'max_connections';" >> "$REPORT_FILE"

        else
            log_message "ERROR" "Database connection failed"
            echo "- Connection: FAILED" >> "$REPORT_FILE"
        fi
    else
        log_message "ERROR" "Database configuration parameters not found"
        echo "- Configuration: INCOMPLETE" >> "$REPORT_FILE"
    fi
}

# Check that the Zabbix server (10051) and agent (10050) TCP ports are
# listening, and run a local agent ping.
# Globals:   REPORT_FILE (appended to)
# Outputs:   "Network Connectivity" report section; status via log_message
check_network_connectivity() {
    log_message "INFO" "Checking network connectivity..."

    echo "## Network Connectivity" >> "$REPORT_FILE"
    echo "### Port Usage" >> "$REPORT_FILE"

    # Take one snapshot of listening TCP sockets. ss is preferred because
    # netstat (net-tools) is often not installed; netstat remains the fallback.
    local listening="" socket_tool=""
    if command -v ss >/dev/null 2>&1; then
        socket_tool="ss"
        listening=$(ss -ltn 2>/dev/null)
    elif command -v netstat >/dev/null 2>&1; then
        socket_tool="netstat"
        listening=$(netstat -ltn 2>/dev/null)
    fi

    if [ -z "$socket_tool" ]; then
        log_message "WARNING" "Neither ss nor netstat is available, skipping port checks"
        echo "- Port Check: TOOL NOT AVAILABLE" >> "$REPORT_FILE"
    else
        # The port must be followed by whitespace so that e.g. :100510 or a
        # longer port number does not count as a match

        # Zabbix server port
        if grep -Eq ':10051[[:space:]]' <<< "$listening"; then
            log_message "SUCCESS" "Zabbix server port 10051 is listening"
            echo "- Server Port 10051: LISTENING" >> "$REPORT_FILE"
        else
            log_message "ERROR" "Zabbix server port 10051 is not listening"
            echo "- Server Port 10051: NOT LISTENING" >> "$REPORT_FILE"
        fi

        # Zabbix agent port
        if grep -Eq ':10050[[:space:]]' <<< "$listening"; then
            log_message "SUCCESS" "Zabbix agent port 10050 is listening"
            echo "- Agent Port 10050: LISTENING" >> "$REPORT_FILE"
        else
            log_message "WARNING" "Zabbix agent port 10050 is not listening"
            echo "- Agent Port 10050: NOT LISTENING" >> "$REPORT_FILE"
        fi

        # Raw socket lines for both ports
        echo "### Active Zabbix Ports" >> "$REPORT_FILE"
        grep -E ':(10050|10051)[[:space:]]' <<< "$listening" >> "$REPORT_FILE"
    fi

    # Local agent ping through zabbix_get, when that tool is installed
    echo "### Agent Connectivity Test" >> "$REPORT_FILE"
    if command -v zabbix_get >/dev/null; then
        local agent_test
        agent_test=$(zabbix_get -s localhost -k agent.ping 2>&1)
        if [ "$agent_test" = "1" ]; then
            log_message "SUCCESS" "Local agent connectivity test passed"
            echo "- Local Agent Test: SUCCESS" >> "$REPORT_FILE"
        else
            log_message "WARNING" "Local agent connectivity test failed"
            echo "- Local Agent Test: FAILED" >> "$REPORT_FILE"
            echo "- Error: $agent_test" >> "$REPORT_FILE"
        fi
    else
        log_message "WARNING" "zabbix_get command not available"
        echo "- Agent Test: TOOL NOT AVAILABLE" >> "$REPORT_FILE"
    fi
}

# Summarise recent errors and warnings from the Zabbix server log and the
# systemd journal.
# Globals:   CONFIG_FILE (read, for LogFile), REPORT_FILE (appended to)
# Outputs:   "Log File Analysis" report section; status via log_message
analyze_log_files() {
    log_message "INFO" "Analyzing Zabbix log files..."

    echo "## Log File Analysis" >> "$REPORT_FILE"

    # Use the LogFile configured for the server when it can be read, otherwise
    # fall back to the packaged default location
    local server_log
    server_log=$(grep "^LogFile=" "$CONFIG_FILE" 2>/dev/null | tail -n 1 | cut -d'=' -f2-)
    server_log=${server_log:-/var/log/zabbix/zabbix_server.log}

    if [ -f "$server_log" ]; then
        echo "### Server Log Analysis" >> "$REPORT_FILE"

        # The headings state what is actually selected: the last N matching
        # lines of the whole file, not a time window
        echo "#### Recent Errors (Last 20 matches)" >> "$REPORT_FILE"
        grep -i error "$server_log" | tail -n 20 >> "$REPORT_FILE"

        echo "#### Recent Warnings (Last 10 matches)" >> "$REPORT_FILE"
        grep -i warning "$server_log" | tail -n 10 >> "$REPORT_FILE"

        echo "#### Database Related Issues" >> "$REPORT_FILE"
        grep -i -E "(database|mysql|postgresql|connection)" "$server_log" | tail -n 10 >> "$REPORT_FILE"

        # Counts are kept in variables; no scratch files in /tmp are needed.
        # grep -c still prints 0 when nothing matches.
        local error_count warning_count
        error_count=$(tail -n 1000 "$server_log" | grep -c -i error)
        warning_count=$(tail -n 1000 "$server_log" | grep -c -i warning)
        echo "#### Error Statistics (Last 1000 lines)" >> "$REPORT_FILE"
        echo "- Errors: $error_count" >> "$REPORT_FILE"
        echo "- Warnings: $warning_count" >> "$REPORT_FILE"

    else
        log_message "WARNING" "Zabbix server log file not found: $server_log"
        echo "### Server Log" >> "$REPORT_FILE"
        echo "- Status: LOG FILE NOT FOUND" >> "$REPORT_FILE"
    fi

    # systemd journal entries of the server unit from the last hour
    echo "### System Log Analysis" >> "$REPORT_FILE"
    echo "#### Recent Zabbix Related System Messages" >> "$REPORT_FILE"
    if command -v journalctl >/dev/null 2>&1; then
        journalctl -u zabbix-server --since "1 hour ago" --no-pager | tail -n 20 >> "$REPORT_FILE"
    else
        echo "- journalctl not available" >> "$REPORT_FILE"
    fi
}

# Record CPU, memory, disk I/O and Zabbix process resource usage.
# Globals:   REPORT_FILE (appended to)
# Outputs:   "Performance Analysis" report section
analyze_performance() {
    log_message "INFO" "Analyzing system performance..."

    echo "## Performance Analysis" >> "$REPORT_FILE"
    echo "### Resource Utilization" >> "$REPORT_FILE"

    # CPU usage is reported as 100 minus the idle percentage from top's
    # summary line. The line is split on commas rather than whitespace because
    # top prints "%Cpu(s):100.0 us" without a separating blank at 100%.
    # LC_ALL=C keeps the decimal separator a dot.
    local cpu_usage
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
        /Cpu\(s\)/ {
            n = split($0, fields, ",")
            for (i = 1; i <= n; i++) {
                if (fields[i] ~ /id/) {
                    gsub(/[^0-9.]/, "", fields[i])
                    printf "%.1f", 100 - fields[i]
                    exit
                }
            }
        }')
    echo "- CPU Usage: ${cpu_usage:-N/A}%" >> "$REPORT_FILE"

    # Memory usage = used / total from the "Mem:" row of free, computed in awk
    # so that bc is not required
    local mem_usage
    mem_usage=$(LC_ALL=C free | awk '/^Mem:/ && $2 > 0 { printf "%.2f", $3 * 100 / $2 }')
    echo "- Memory Usage: ${mem_usage:-N/A}%" >> "$REPORT_FILE"

    # Disk I/O: three one-second samples, last 10 lines kept
    echo "### Disk I/O Statistics" >> "$REPORT_FILE"
    if command -v iostat >/dev/null; then
        iostat -x 1 3 | tail -n 10 >> "$REPORT_FILE"
    else
        echo "- iostat not available" >> "$REPORT_FILE"
    fi

    # Zabbix process usage. -x matches the exact process name and -o selects a
    # single (the oldest) PID, so ps -p always receives one valid PID.
    echo "### Zabbix Process Performance" >> "$REPORT_FILE"
    local server_pid
    server_pid=$(pgrep -o -x zabbix_server)
    if [ -n "$server_pid" ]; then
        echo "#### Server Process Resource Usage" >> "$REPORT_FILE"
        ps -p "$server_pid" -o pid,pcpu,pmem,vsz,rss,etime,cmd >> "$REPORT_FILE"

        # All Zabbix-related processes, highest CPU first
        echo "#### Zabbix Process Details" >> "$REPORT_FILE"
        ps aux | grep zabbix | grep -v grep | sort -k3 -nr >> "$REPORT_FILE"
    fi
}

# Re-run the key health checks, append each finding with a recommendation to
# the report, and finish with an overall status.
# Globals:   CONFIG_FILE (read), REPORT_FILE (appended to)
# Outputs:   "Issues Identified and Recommendations" report section; one
#            summary line via log_message
identify_issues_and_recommendations() {
    log_message "INFO" "Identifying issues and generating recommendations..."

    echo "## Issues Identified and Recommendations" >> "$REPORT_FILE"

    local issues=0
    # Findings labelled CRITICAL are counted separately: a single one is enough
    # to make the overall status CRITICAL, regardless of the total count
    local critical=0

    # --- Zabbix server process (-x: exact process name, not any command line
    # that merely mentions zabbix_server)
    if ! pgrep -x zabbix_server > /dev/null; then
        {
            echo "### 🚨 CRITICAL: Zabbix Server Not Running"
            echo "**Recommendation**: Start Zabbix server service immediately"
            echo '```bash'
            echo "systemctl start zabbix-server"
            echo "systemctl status zabbix-server"
            echo '```'
            echo ""
        } >> "$REPORT_FILE"
        issues=$((issues + 1))
        critical=$((critical + 1))
    fi

    # --- Database connection. "cut -f2-" keeps values containing "=" intact;
    # DBHost is optional in zabbix_server.conf and defaults to localhost.
    local db_host db_name db_user db_password
    db_host=$(grep "^DBHost=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)
    db_name=$(grep "^DBName=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)
    db_user=$(grep "^DBUser=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)
    db_password=$(grep "^DBPassword=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)
    db_host=${db_host:-localhost}

    if [ -n "$db_name" ] && [ -n "$db_user" ]; then
        # MYSQL_PWD keeps the password out of the argument list and avoids an
        # interactive prompt when the password is empty
        if ! MYSQL_PWD="$db_password" mysql -h "$db_host" -u "$db_user" -e "SELECT 1;" "$db_name" >/dev/null 2>&1; then
            {
                echo "### 🚨 CRITICAL: Database Connection Failed"
                echo "**Recommendation**: Check database connectivity and credentials"
                echo "- Verify database server is running"
                echo "- Check network connectivity to database"
                echo "- Verify database credentials in configuration"
                echo ""
            } >> "$REPORT_FILE"
            issues=$((issues + 1))
            critical=$((critical + 1))
        fi
    fi

    # --- Server port. ss is preferred, netstat is the fallback; when neither
    # exists the check is skipped instead of reporting a false finding.
    local listening="" socket_tool=""
    if command -v ss >/dev/null 2>&1; then
        socket_tool="ss"
        listening=$(ss -ltn 2>/dev/null)
    elif command -v netstat >/dev/null 2>&1; then
        socket_tool="netstat"
        listening=$(netstat -ltn 2>/dev/null)
    fi

    if [ -n "$socket_tool" ] && ! grep -Eq ':10051[[:space:]]' <<< "$listening"; then
        {
            echo "### ⚠️ WARNING: Zabbix Server Port Not Listening"
            echo "**Recommendation**: Check Zabbix server configuration and firewall"
            echo '```bash'
            echo "# Check if port is configured correctly"
            echo "grep ListenPort /etc/zabbix/zabbix_server.conf"
            echo "# Check firewall rules"
            echo "firewall-cmd --list-ports"
            echo '```'
            echo ""
        } >> "$REPORT_FILE"
        issues=$((issues + 1))
    fi

    # --- CPU usage: 100 minus the idle percentage of top's summary line,
    # split on commas so glued fields such as "%Cpu(s):100.0 us" still parse
    local cpu_usage
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
        /Cpu\(s\)/ {
            n = split($0, fields, ",")
            for (i = 1; i <= n; i++) {
                if (fields[i] ~ /id/) {
                    gsub(/[^0-9.]/, "", fields[i])
                    printf "%.1f", 100 - fields[i]
                    exit
                }
            }
        }')
    # An empty value (top output not recognised) skips the check instead of
    # feeding a broken expression to the comparison
    if [ -n "$cpu_usage" ] && awk -v v="$cpu_usage" 'BEGIN { exit !(v > 80) }'; then
        {
            echo "### ⚠️ WARNING: High CPU Usage"
            echo "**Current CPU Usage**: ${cpu_usage}%"
            echo "**Recommendation**: Investigate high CPU usage"
            echo "- Check for runaway processes"
            echo "- Review Zabbix configuration for optimization"
            echo "- Consider scaling resources"
            echo ""
        } >> "$REPORT_FILE"
        issues=$((issues + 1))
    fi

    # --- Memory usage: used / total from the "Mem:" row of free
    local mem_usage
    mem_usage=$(LC_ALL=C free | awk '/^Mem:/ && $2 > 0 { printf "%.2f", $3 * 100 / $2 }')
    if [ -n "$mem_usage" ] && awk -v v="$mem_usage" 'BEGIN { exit !(v > 85) }'; then
        {
            echo "### ⚠️ WARNING: High Memory Usage"
            echo "**Current Memory Usage**: ${mem_usage}%"
            echo "**Recommendation**: Investigate memory usage"
            echo "- Check for memory leaks"
            echo "- Review Zabbix cache settings"
            echo "- Consider adding more RAM"
            echo ""
        } >> "$REPORT_FILE"
        issues=$((issues + 1))
    fi

    # --- Root filesystem usage. df -P keeps each filesystem on one line even
    # when the device name is long.
    local disk_usage
    disk_usage=$(df -P / | awk 'NR==2 { sub(/%/, "", $5); print $5 }')
    case "$disk_usage" in
        '' | *[!0-9]*)
            # Not a plain integer: nothing reliable to compare
            ;;
        *)
            if [ "$disk_usage" -gt 90 ]; then
                {
                    echo "### 🚨 CRITICAL: Low Disk Space"
                    echo "**Current Disk Usage**: ${disk_usage}%"
                    echo "**Recommendation**: Free up disk space immediately"
                    echo "- Clean up old log files"
                    echo "- Archive old monitoring data"
                    echo "- Check for large temporary files"
                    echo ""
                } >> "$REPORT_FILE"
                issues=$((issues + 1))
                critical=$((critical + 1))
            fi
            ;;
    esac

    # --- Overall status
    echo "### Summary" >> "$REPORT_FILE"
    if [ "$issues" -eq 0 ]; then
        echo "✅ **Overall Status**: HEALTHY" >> "$REPORT_FILE"
        echo "No critical issues detected. System appears to be running normally." >> "$REPORT_FILE"
        log_message "SUCCESS" "No critical issues detected"
    elif [ "$critical" -eq 0 ] && [ "$issues" -le 2 ]; then
        echo "⚠️ **Overall Status**: ATTENTION REQUIRED" >> "$REPORT_FILE"
        echo "Some issues detected that require attention. Please review recommendations above." >> "$REPORT_FILE"
        log_message "WARNING" "$issues issues detected requiring attention"
    else
        echo "🚨 **Overall Status**: CRITICAL" >> "$REPORT_FILE"
        echo "Multiple critical issues detected. Immediate action required." >> "$REPORT_FILE"
        log_message "ERROR" "$issues critical issues detected"
    fi

    echo "**Total Issues Found**: $issues" >> "$REPORT_FILE"
}

# Append the report footer and announce where the report and log were saved.
# Globals:   REPORT_FILE (appended to), VERSION, LOG_FILE (read)
finalize_report() {
    {
        printf '\n'
        printf '%s\n' "---"
        printf '%s\n' "**Report Generated**: $(date)"
        printf '%s\n' "**Script Version**: $VERSION"
        printf '%s\n' "**Log File**: $LOG_FILE"
    } >> "$REPORT_FILE"

    local notice
    for notice in \
        "Diagnostic report saved to: $REPORT_FILE" \
        "Detailed logs saved to: $LOG_FILE"; do
        log_message "INFO" "$notice"
    done
}

# Run the diagnostics selected on the command line.
# Globals:   VERSION, LOG_FILE (read), REPORT_FILE (reset, then filled by the
#            selected checks)
# Arguments: [mode] [--show]
#            mode   - quick, config, network, performance, logs or full;
#                     anything else (or nothing) runs the full diagnostic
#            --show - print the report after it has been generated; accepted
#                     in any position, so "--show" alone means "full --show"
main() {
    local mode="full"
    local show_report=0
    local mode_seen=0
    local arg

    for arg in "$@"; do
        case "$arg" in
            --show)
                show_report=1
                ;;
            *)
                # The first non-option argument selects the mode
                if [ "$mode_seen" -eq 0 ]; then
                    mode="$arg"
                    mode_seen=1
                fi
                ;;
        esac
    done

    echo "Zabbix Diagnostic Tool v$VERSION"
    echo "=================================="
    echo ""

    # log_message appends to LOG_FILE; create its directory on a best-effort
    # basis so logging also works on hosts where it does not exist yet
    mkdir -p -- "$(dirname -- "$LOG_FILE")" 2>/dev/null || true

    # REPORT_FILE is a fixed path and most checks only append to it. Start
    # every run from an empty file so partial modes do not pile their sections
    # on top of an older report.
    : > "$REPORT_FILE"

    log_message "INFO" "Starting Zabbix diagnostic (mode: $mode)..."

    case "$mode" in
        "quick")
            check_zabbix_processes
            check_database_connectivity
            identify_issues_and_recommendations
            ;;
        "config")
            verify_configuration
            ;;
        "network")
            check_network_connectivity
            ;;
        "performance")
            analyze_performance
            ;;
        "logs")
            analyze_log_files
            ;;
        "full"|*)
            collect_system_info
            check_zabbix_processes
            verify_configuration
            check_database_connectivity
            check_network_connectivity
            analyze_log_files
            analyze_performance
            identify_issues_and_recommendations
            ;;
    esac

    finalize_report

    echo ""
    echo "Diagnostic completed. Report available at: $REPORT_FILE"

    # Optional report display
    if [ "$show_report" -eq 1 ]; then
        echo ""
        echo "=== DIAGNOSTIC REPORT ==="
        cat "$REPORT_FILE"
    fi
}

# Print usage information for the diagnostic tool to stdout.
# Globals:   VERSION (read); $0 is shown as the command name
show_help() {
    cat << EOF
Zabbix Diagnostic Tool v$VERSION

USAGE:
  $0 [mode] [options]

MODES:
  full        Full diagnostic (default)
  quick       Quick health check
  config      Configuration verification only
  network     Network connectivity check
  performance Performance analysis
  logs        Log file analysis

OPTIONS:
  --show      Display report after generation
  --help      Show this help message

EXAMPLES:
  $0                    # Full diagnostic
  $0 quick --show       # Quick check with report display
  $0 performance        # Performance analysis only
EOF
}

# Script entry point: answer --help / -h before doing any diagnostic work
if [ "${1:-}" = "--help" ] || [ "${1:-}" = "-h" ]; then
    show_help
    exit 0
fi

# Otherwise run the diagnostics, forwarding all command-line arguments
main "$@"

一般的な問題と解決方法

サービス起動問題

Zabbixサーバー起動失敗

yaml
# 起動失敗の一般的原因と対処
起動失敗パターン:
  設定ファイルエラー:
    症状:
      - "cannot parse config file"
      - "invalid parameter"
      - "configuration file error"
    
    確認方法:
      - zabbix_server -T  # 設定ファイルの検証(-T / --test-config、Zabbix 7.0以降)
      - journalctl -u zabbix-server
      - /var/log/zabbix/zabbix_server.log
    
    解決手順:
      1. 設定ファイル構文確認
      2. パラメータ値検証
      3. ファイル権限確認
      4. バックアップからの復旧

  データベース接続エラー:
    症状:
      - "cannot connect to database"
      - "access denied for user"
      - "database connection timeout"
    
    解決手順:
      1. データベースサービス確認
      2. 認証情報検証
      3. ネットワーク接続確認
      4. ファイアウォール設定確認

  権限問題:
    症状:
      - "permission denied"
      - "cannot write to log file"
      - "cannot create PID file"
    
    解決手順:
      1. ファイル・ディレクトリ権限確認
      2. SELinux設定確認
      3. ユーザー・グループ設定確認
      4. プロセス実行権限確認

  リソース不足:
    症状:
      - "cannot allocate memory"
      - "too many open files"
      - "disk space full"
    
    解決手順:
      1. システムリソース確認
      2. ulimit設定確認
      3. ディスク容量確認
      4. メモリ使用量最適化

自動解決スクリプト

bash
#!/bin/bash
# Automatic resolver for Zabbix server startup problems
#
# File-scope settings shared by every function below.

# systemd unit that is inspected and restarted
SERVICE_NAME="zabbix-server"
# Server configuration file that is validated and, if needed, repaired
CONFIG_FILE="/etc/zabbix/zabbix_server.conf"
# Every log_fix message is appended here
LOG_FILE="/var/log/zabbix/startup_fix.log"

# Print a timestamped message and append the same line to the fix log.
# Globals:   LOG_FILE (appended to)
# Arguments: $1 - message text
log_fix() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '%s\n' "[$stamp] $1" | tee -a "$LOG_FILE"
}

# Diagnose why the Zabbix server service is not active, apply the available
# fixes and try to restart it.
# Globals:   SERVICE_NAME, CONFIG_FILE (read; CONFIG_FILE may be replaced from
#            "${CONFIG_FILE}.backup")
# Returns:   0 if the service is (or becomes) active, 1 otherwise
fix_startup_issues() {
    log_fix "Starting Zabbix startup issue diagnosis..."

    # 1. Current service state
    local service_status service_enabled
    service_status=$(systemctl is-active "$SERVICE_NAME")
    service_enabled=$(systemctl is-enabled "$SERVICE_NAME")

    log_fix "Service status: $service_status"
    log_fix "Service enabled: $service_enabled"

    if [ "$service_status" = "active" ]; then
        log_fix "✓ Zabbix server is already running"
        return 0
    fi

    log_fix "Service is not active, attempting fixes..."

    # 2. Configuration validation and repair.
    # -T (--test-config) is the server's validate-and-exit option, available
    # from Zabbix 7.0; on older servers the check fails and the repair path
    # below is taken.
    if ! zabbix_server -c "$CONFIG_FILE" -T >/dev/null 2>&1; then
        log_fix "Configuration file has errors, attempting to fix..."

        local config_valid=0

        if [ -f "${CONFIG_FILE}.backup" ]; then
            # Keep the failing configuration before overwriting it, so the
            # restore can be undone and the broken file can be inspected later
            local can_restore=1
            if [ -f "$CONFIG_FILE" ]; then
                local broken_copy
                broken_copy="${CONFIG_FILE}.broken.$(date +%Y%m%d_%H%M%S)"
                if cp -p "$CONFIG_FILE" "$broken_copy"; then
                    log_fix "Saved current configuration as $broken_copy"
                else
                    log_fix "Could not save current configuration, skipping restore from backup"
                    can_restore=0
                fi
            fi

            if [ "$can_restore" -eq 1 ]; then
                log_fix "Restoring from backup configuration..."
                cp "${CONFIG_FILE}.backup" "$CONFIG_FILE"

                if zabbix_server -c "$CONFIG_FILE" -T >/dev/null 2>&1; then
                    log_fix "Configuration restored successfully from backup"
                    config_valid=1
                else
                    log_fix "Backup configuration also has errors"
                fi
            fi
        fi

        # Only rewrite the file when it is still invalid; a configuration that
        # already validates is left exactly as restored
        if [ "$config_valid" -eq 0 ]; then
            fix_common_config_errors
        fi
    fi

    # 3. Permissions
    fix_permission_issues

    # 4. Resources
    fix_resource_issues

    # 5. Database connectivity
    fix_database_connectivity

    # 6. Restart attempt
    log_fix "Attempting to restart Zabbix server..."
    systemctl restart "$SERVICE_NAME"

    # Give the service a moment to settle before checking its state
    sleep 5
    if systemctl is-active "$SERVICE_NAME" >/dev/null; then
        log_fix "✓ Zabbix server started successfully"
        return 0
    fi

    log_fix "✗ Zabbix server failed to start"

    # Copy the most recent journal lines into the fix log. IFS= and -r keep
    # leading whitespace and backslashes of each line intact.
    log_fix "Error details:"
    local line
    journalctl -u "$SERVICE_NAME" --since "1 minute ago" --no-pager | tail -n 10 | while IFS= read -r line; do
        log_fix "  $line"
    done

    return 1
}

# Repair common problems in the server configuration file: stray control
# characters, repeated parameter lines and missing LogFile/PidFile settings.
# Globals:   CONFIG_FILE (rewritten in place; a copy is kept next to it)
# Returns:   1 if no safety copy or temp file could be created (the
#            configuration is then left untouched), otherwise the status of
#            the final log call
fix_common_config_errors() {
    log_fix "Fixing common configuration errors..."

    # Never edit the only copy: keep the original next to the file first
    local safety_copy
    safety_copy="${CONFIG_FILE}.pre-fix.$(date +%Y%m%d_%H%M%S)"
    if ! cp -p "$CONFIG_FILE" "$safety_copy"; then
        log_fix "Cannot back up $CONFIG_FILE, leaving it untouched"
        return 1
    fi
    log_fix "Saved original configuration as $safety_copy"

    local cleaned
    if ! cleaned=$(mktemp); then
        log_fix "Cannot create temporary file, leaving $CONFIG_FILE untouched"
        return 1
    fi

    # tr removes control bytes such as the CR of Windows line endings but keeps
    # tab and newline. awk then drops repeated parameter lines while passing
    # comments and blank lines through unchanged, so the layout of the file
    # survives.
    if tr -d '\000-\010\013-\037\177' < "$CONFIG_FILE" \
        | awk '/^[[:space:]]*(#|$)/ { print; next } !seen[$0]++' > "$cleaned"; then
        # Writing into the existing file (instead of moving the temp file over
        # it) keeps its owner and mode, which matters because the file holds
        # the database password
        cat "$cleaned" > "$CONFIG_FILE"
    else
        log_fix "Could not clean up $CONFIG_FILE"
    fi
    rm -f -- "$cleaned"

    # Add essential parameters when they are missing
    if ! grep -q "^LogFile=" "$CONFIG_FILE"; then
        echo "LogFile=/var/log/zabbix/zabbix_server.log" >> "$CONFIG_FILE"
        log_fix "Added missing LogFile parameter"
    fi

    if ! grep -q "^PidFile=" "$CONFIG_FILE"; then
        echo "PidFile=/run/zabbix/zabbix_server.pid" >> "$CONFIG_FILE"
        log_fix "Added missing PidFile parameter"
    fi

    # Re-validate (-T / --test-config requires Zabbix 7.0 or later)
    if zabbix_server -c "$CONFIG_FILE" -T >/dev/null 2>&1; then
        log_fix "Configuration errors fixed"
    else
        log_fix "Configuration still has errors"
    fi
}

# Ensure the Zabbix log/run directories and the configuration file have the
# expected owner and mode, and refresh SELinux labels when SELinux is enforcing.
# Globals:   CONFIG_FILE (owner and mode changed)
fix_permission_issues() {
    log_fix "Fixing permission issues..."

    # Runtime directories: create if missing, owned by zabbix, mode 755
    local runtime_dir
    for runtime_dir in /var/log/zabbix /run/zabbix; do
        mkdir -p "$runtime_dir"
        chown zabbix:zabbix "$runtime_dir"
        chmod 755 "$runtime_dir"
    done

    # Configuration file: owned by root, group zabbix, mode 640
    chown root:zabbix "$CONFIG_FILE"
    chmod 640 "$CONFIG_FILE"

    # SELinux handling only applies when getenforce exists and reports Enforcing
    local selinux_mode=""
    if command -v getenforce >/dev/null; then
        selinux_mode=$(getenforce)
    fi

    if [ "$selinux_mode" = "Enforcing" ]; then
        log_fix "Checking SELinux settings..."

        # Restore the default security contexts
        restorecon -R /var/log/zabbix
        restorecon -R /run/zabbix
        restorecon "$CONFIG_FILE"

        # Enable the booleans persistently; failures (for example a boolean
        # that does not exist on this system) are ignored on purpose
        local sebool
        for sebool in zabbix_can_network httpd_can_network_connect; do
            setsebool -P "$sebool" on 2>/dev/null || true
        done
    fi

    log_fix "Permission issues fixed"
}

# リソース問題修復
fix_resource_issues() {
    log_fix "Checking and fixing resource issues..."
    
    # ディスク容量確認
    local disk_usage=$(df /var | awk 'NR==2 {print $5}' | sed 's/%//')
    if [ "$disk_usage" -gt 95 ]; then
        log_fix "Critical disk space issue detected (${disk_usage}%)"
        
        # ログローテーション強制実行
        logrotate -f /etc/logrotate.conf
        
        # 古いログファイル削除
        find /var/log -name "*.log.*" -mtime +7 -delete
        
        log_fix "Disk space cleanup completed"
    fi
    
    # メモリ確認
    local mem_available=$(free | grep Available | awk '{print $2}')
    if [ "$mem_available" -lt 1048576 ]; then  # 1GB未満
        log_fix "Low memory detected, clearing cache..."
        sync && echo 3 > /proc/sys/vm/drop_caches
    fi
    
    # ulimit設定確認
    local current_limit=$(ulimit -n)
    if [ "$current_limit" -lt 4096 ]; then
        log_fix "Increasing file descriptor limit..."
        echo "zabbix soft nofile 65536" >> /etc/security/limits.conf
        echo "zabbix hard nofile 65536" >> /etc/security/limits.conf
    fi
}

# Test the database connection configured for the Zabbix server and, if it
# fails, try to start a local MySQL/MariaDB service and test again.
# Globals:   CONFIG_FILE (read)
fix_database_connectivity() {
    log_fix "Checking database connectivity..."

    # "cut -f2-" keeps everything after the first "=", so values that contain
    # "=" themselves (typically passwords) are not truncated. "tail -n 1" keeps
    # a repeated parameter from producing a multi-line value.
    local db_host db_name db_user db_password
    db_host=$(grep "^DBHost=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)
    db_name=$(grep "^DBName=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)
    db_user=$(grep "^DBUser=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)
    db_password=$(grep "^DBPassword=" "$CONFIG_FILE" | tail -n 1 | cut -d'=' -f2-)

    # DBHost is optional in zabbix_server.conf and defaults to localhost
    db_host=${db_host:-localhost}

    if [ -z "$db_name" ] || [ -z "$db_user" ]; then
        log_fix "Database configuration parameters missing"
        return
    fi

    # The password is passed via MYSQL_PWD instead of -p<password>: it stays
    # out of the process argument list, and an empty password no longer turns
    # into a bare "-p" that waits for interactive input
    if MYSQL_PWD="$db_password" mysql -h "$db_host" -u "$db_user" -e "SELECT 1;" "$db_name" >/dev/null 2>&1; then
        log_fix "✓ Database connection is working"
        return
    fi

    log_fix "Database connection failed, attempting fixes..."

    # Make sure a local database service is running
    if systemctl is-active mysql >/dev/null || systemctl is-active mariadb >/dev/null; then
        log_fix "Database service is running"
    else
        log_fix "Starting database service..."
        systemctl start mysql 2>/dev/null || systemctl start mariadb 2>/dev/null
        sleep 3
    fi

    # Second connection attempt
    if MYSQL_PWD="$db_password" mysql -h "$db_host" -u "$db_user" -e "SELECT 1;" "$db_name" >/dev/null 2>&1; then
        log_fix "✓ Database connection restored"
        return
    fi

    log_fix "✗ Database connection still failing"

    # For a remote database host, check basic reachability as a hint
    if [ "$db_host" != "localhost" ] && [ "$db_host" != "127.0.0.1" ]; then
        if ! ping -c 1 "$db_host" >/dev/null 2>&1; then
            log_fix "Network connectivity to database host failed"
        fi
    fi
}

# Entry point of the startup resolver: run the fixes and exit with their result.
# Globals:   LOG_FILE (read, shown in the failure hint)
# Exits:     0 when fix_startup_issues succeeds, 1 otherwise
main() {
    printf '%s\n' \
        "Zabbix Startup Issue Resolver" \
        "============================="

    local outcome
    fix_startup_issues
    outcome=$?

    if [ "$outcome" -ne 0 ]; then
        echo "✗ Failed to resolve all startup issues"
        echo "Check log file: $LOG_FILE"
        exit 1
    fi

    echo "✓ Zabbix startup issues resolved successfully"
    exit 0
}

# Run the resolver, forwarding all command-line arguments
main "$@"

パフォーマンス問題

パフォーマンス問題の特定

性能監視・分析システム

python
#!/usr/bin/env python3
"""Zabbix性能分析・最適化システム"""

import psutil
import mysql.connector
import time
import json
import subprocess
from datetime import datetime, timedelta
from typing import Dict, List, Tuple

class ZabbixPerformanceAnalyzer:
    """Zabbix performance analysis and optimization helper.

    Gathers operating-system metrics through ``psutil``, server and schema
    statistics through a MySQL connection, and turns the combined data into a
    list of bottlenecks plus optimization recommendations.
    """

    def __init__(self, db_config: Dict):
        """Store the connection settings; no connection is opened here.

        Args:
            db_config: Keyword arguments passed to ``mysql.connector.connect``.
                Its ``database`` entry is also used to filter table statistics.
        """
        self.db_config = db_config
        self.connection = None
        self.performance_data = {}

    def connect_database(self):
        """Open the database connection.

        Returns:
            True on success, False when the connection attempt raised.
        """
        try:
            self.connection = mysql.connector.connect(**self.db_config)
            return True
        except Exception as e:
            print(f"Database connection failed: {e}")
            return False

    def analyze_system_performance(self) -> Dict:
        """Take a snapshot of CPU, memory, disk/network I/O and Zabbix processes.

        Returns:
            Dict with the keys ``timestamp``, ``cpu``, ``memory``, ``disk_io``,
            ``network_io`` and ``zabbix_processes``. I/O counters are reported
            as 0 when psutil has no counters to offer.
        """
        print("Analyzing system performance...")

        # CPU usage (blocks for the 1 second sampling interval)
        cpu_percent = psutil.cpu_percent(interval=1)
        cpu_count = psutil.cpu_count()

        # Memory usage
        memory = psutil.virtual_memory()

        # Disk and network I/O; psutil returns None on hosts without disks or
        # network interfaces, so the fields are read defensively below.
        disk_io = psutil.disk_io_counters()
        network_io = psutil.net_io_counters()

        # Process information
        zabbix_processes = []
        for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
            # Attributes psutil could not read are filled with None.
            if 'zabbix' in (proc.info.get('name') or ''):
                zabbix_processes.append(proc.info)

        return {
            'timestamp': datetime.now().isoformat(),
            'cpu': {
                'usage_percent': cpu_percent,
                'core_count': cpu_count,
                'load_average': psutil.getloadavg()
            },
            'memory': {
                'total': memory.total,
                'available': memory.available,
                'used': memory.used,
                'usage_percent': memory.percent
            },
            'disk_io': {
                'read_bytes': getattr(disk_io, 'read_bytes', 0),
                'write_bytes': getattr(disk_io, 'write_bytes', 0),
                'read_count': getattr(disk_io, 'read_count', 0),
                'write_count': getattr(disk_io, 'write_count', 0)
            },
            'network_io': {
                'bytes_sent': getattr(network_io, 'bytes_sent', 0),
                'bytes_recv': getattr(network_io, 'bytes_recv', 0),
                'packets_sent': getattr(network_io, 'packets_sent', 0),
                'packets_recv': getattr(network_io, 'packets_recv', 0)
            },
            'zabbix_processes': zabbix_processes
        }

    def analyze_database_performance(self) -> Dict:
        """Collect MySQL server statistics.

        Returns:
            Dict that may contain ``status_variables``, ``max_connections``,
            ``active_connections``, ``process_list``, ``largest_tables`` and
            ``slow_queries``. Empty when there is no connection; partially
            filled when a query fails (the error is printed).
        """
        print("Analyzing database performance...")

        if not self.connection:
            return {}

        queries = {
            # Without GLOBAL, SHOW STATUS reports session counters only.
            'status_variables': "SHOW GLOBAL STATUS",
            # max_connections is a system variable and is never part of
            # SHOW STATUS, so it needs its own query.
            'max_connections': "SHOW VARIABLES LIKE 'max_connections'",
            'process_list': "SHOW PROCESSLIST",
            # Explicit aliases keep the result keys lower-case; MySQL 8
            # returns information_schema column names in upper case.
            'table_sizes': """
                SELECT
                    table_name AS table_name,
                    ROUND(((data_length + index_length) / 1024 / 1024), 2) AS size_mb,
                    table_rows AS table_rows
                FROM information_schema.TABLES
                WHERE table_schema = %s
                ORDER BY (data_length + index_length) DESC
                LIMIT 10
            """,
            # Performance Schema timers are in picoseconds: 1 s = 10^12.
            'slow_queries': """
                SELECT
                    DIGEST_TEXT,
                    COUNT_STAR,
                    AVG_TIMER_WAIT/1000000000000 as avg_time_sec,
                    MAX_TIMER_WAIT/1000000000000 as max_time_sec
                FROM performance_schema.events_statements_summary_by_digest
                WHERE AVG_TIMER_WAIT > 1000000000000
                ORDER BY AVG_TIMER_WAIT DESC
                LIMIT 10
            """
        }

        results = {}
        cursor = self.connection.cursor(dictionary=True)

        try:
            # Status variables
            cursor.execute(queries['status_variables'])
            results['status_variables'] = {
                row['Variable_name']: row['Value'] for row in cursor.fetchall()
            }

            # Configured connection limit
            cursor.execute(queries['max_connections'])
            limit_rows = cursor.fetchall()
            if limit_rows:
                results['max_connections'] = int(limit_rows[0]['Value'])

            # Process list
            cursor.execute(queries['process_list'])
            process_list = cursor.fetchall()
            results['active_connections'] = len(process_list)
            results['process_list'] = process_list

            # Table sizes
            cursor.execute(queries['table_sizes'], (self.db_config.get('database'),))
            results['largest_tables'] = cursor.fetchall()

            # Slow queries; the query fails when performance_schema is
            # disabled or not readable, which is reported as "none found".
            try:
                cursor.execute(queries['slow_queries'])
                results['slow_queries'] = cursor.fetchall()
            except Exception:
                results['slow_queries'] = []

        except Exception as e:
            print(f"Database analysis error: {e}")
        finally:
            cursor.close()

        return results

    def analyze_zabbix_internals(self) -> Dict:
        """Query the Zabbix schema for object counts and data statistics.

        Every query is isolated: a failing one is reported and skipped so the
        remaining metrics are still collected.

        Returns:
            Dict keyed by metric name; empty when there is no connection.
        """
        print("Analyzing Zabbix internal metrics...")

        if not self.connection:
            return {}

        internal_queries = {
            'host_count': "SELECT COUNT(*) as count FROM hosts WHERE status = 0",
            'item_count': "SELECT COUNT(*) as count FROM items WHERE status = 0",
            'trigger_count': "SELECT COUNT(*) as count FROM triggers WHERE status = 0",
            'active_problems': "SELECT COUNT(*) as count FROM problem WHERE source = 0 AND object = 0",
            # NOTE(review): AVG(delay) assumes a numeric `delay` column —
            # confirm against the schema version in use.
            'queue_stats': """
                SELECT 
                    type,
                    COUNT(*) as item_count,
                    AVG(delay) as avg_delay
                FROM items i
                JOIN hosts h ON i.hostid = h.hostid
                WHERE i.status = 0 AND h.status = 0
                GROUP BY type
            """,
            # NOTE(review): assumes `items` exposes a `lastvalue_ts` column —
            # verify against the target schema; on failure only this metric
            # is skipped.
            'data_freshness': """
                SELECT 
                    COUNT(CASE WHEN lastvalue_ts > UNIX_TIMESTAMP() - 3600 THEN 1 END) as updated_last_hour,
                    COUNT(CASE WHEN lastvalue_ts > UNIX_TIMESTAMP() - 86400 THEN 1 END) as updated_last_day,
                    COUNT(*) as total_items
                FROM items i
                JOIN hosts h ON i.hostid = h.hostid
                WHERE i.status = 0 AND h.status = 0
            """,
            'history_size': """
                SELECT 
                    'history' as table_name,
                    COUNT(*) as record_count,
                    MIN(FROM_UNIXTIME(clock)) as oldest_record,
                    MAX(FROM_UNIXTIME(clock)) as newest_record
                FROM history
                UNION ALL
                SELECT 
                    'history_uint' as table_name,
                    COUNT(*) as record_count,
                    MIN(FROM_UNIXTIME(clock)) as oldest_record,
                    MAX(FROM_UNIXTIME(clock)) as newest_record
                FROM history_uint
                UNION ALL
                SELECT 
                    'history_str' as table_name,
                    COUNT(*) as record_count,
                    MIN(FROM_UNIXTIME(clock)) as oldest_record,
                    MAX(FROM_UNIXTIME(clock)) as newest_record
                FROM history_str
            """
        }

        scalar_metrics = ('host_count', 'item_count', 'trigger_count', 'active_problems')
        results = {}
        cursor = self.connection.cursor(dictionary=True)

        try:
            for query_name, query in internal_queries.items():
                try:
                    cursor.execute(query)
                    # Always drain the result set so the cursor can be reused.
                    rows = cursor.fetchall()
                except Exception as e:
                    print(f"Zabbix internals analysis error ({query_name}): {e}")
                    continue

                if query_name in scalar_metrics:
                    results[query_name] = rows[0]['count'] if rows else 0
                elif query_name == 'data_freshness':
                    results[query_name] = rows[0] if rows else {}
                else:
                    results[query_name] = rows
        finally:
            cursor.close()

        return results

    def identify_performance_bottlenecks(self, system_data: Dict, db_data: Dict, zabbix_data: Dict) -> List[Dict]:
        """Derive bottlenecks from the collected metrics.

        Args:
            system_data: Result of ``analyze_system_performance``.
            db_data: Result of ``analyze_database_performance``.
            zabbix_data: Result of ``analyze_zabbix_internals``.

        Returns:
            List of dicts with ``category``, ``severity``, ``description`` and
            ``recommendation``. Missing inputs are treated as "no finding".
        """
        print("Identifying performance bottlenecks...")

        bottlenecks = []

        # CPU usage check
        if system_data.get('cpu', {}).get('usage_percent', 0) > 80:
            bottlenecks.append({
                'category': 'CPU',
                'severity': 'high',
                'description': f"High CPU usage: {system_data['cpu']['usage_percent']:.1f}%",
                'recommendation': "Investigate CPU-intensive processes and optimize Zabbix configuration"
            })

        # Memory usage check
        memory_usage = system_data.get('memory', {}).get('usage_percent', 0)
        if memory_usage > 85:
            bottlenecks.append({
                'category': 'Memory',
                'severity': 'high',
                'description': f"High memory usage: {memory_usage:.1f}%",
                'recommendation': "Check for memory leaks and optimize cache settings"
            })

        # Database connection usage check. The limit is taken from the
        # dedicated 'max_connections' entry; the status_variables lookup is
        # kept as a fallback for data produced by older versions.
        active_connections = db_data.get('active_connections', 0)
        raw_limit = db_data.get('max_connections')
        if raw_limit is None:
            raw_limit = db_data.get('status_variables', {}).get('max_connections', 0)
        max_connections = int(raw_limit or 0)

        if max_connections > 0 and active_connections / max_connections > 0.8:
            bottlenecks.append({
                'category': 'Database',
                'severity': 'high',
                'description': f"High database connection usage: {active_connections}/{max_connections}",
                'recommendation': "Optimize database queries and connection pooling"
            })

        # Slow query check
        slow_queries = db_data.get('slow_queries', [])
        if len(slow_queries) > 0:
            bottlenecks.append({
                'category': 'Database',
                'severity': 'medium',
                'description': f"Found {len(slow_queries)} slow queries",
                'recommendation': "Optimize slow queries and add appropriate indexes"
            })

        # Data freshness check
        data_freshness = zabbix_data.get('data_freshness', {})
        if data_freshness:
            total_items = data_freshness.get('total_items', 0) or 0
            updated_last_hour = data_freshness.get('updated_last_hour', 0) or 0

            if total_items > 0 and updated_last_hour / total_items < 0.8:
                bottlenecks.append({
                    'category': 'Data Collection',
                    'severity': 'medium',
                    'description': f"Low data freshness: {updated_last_hour}/{total_items} items updated in last hour",
                    'recommendation': "Check agent connectivity and polling configuration"
                })

        # Large table check (three largest tables only)
        largest_tables = db_data.get('largest_tables', [])
        for table in largest_tables[:3]:
            # size_mb arrives as Decimal, or None when the engine reports no
            # data/index length for the table.
            size_mb = float(table.get('size_mb') or 0)
            if size_mb > 10000:  # roughly 10 GB or more
                table_name = table.get('table_name', table.get('TABLE_NAME', 'unknown'))
                bottlenecks.append({
                    'category': 'Storage',
                    'severity': 'medium',
                    'description': f"Large table: {table_name} ({size_mb:.1f} MB)",
                    'recommendation': "Consider data archiving and partitioning"
                })

        return bottlenecks

    def generate_optimization_recommendations(self, bottlenecks: List[Dict]) -> Dict:
        """Map bottlenecks to grouped, de-duplicated recommendations.

        Args:
            bottlenecks: Output of ``identify_performance_bottlenecks``.

        Returns:
            Dict with the lists ``immediate_actions``, ``configuration_changes``,
            ``infrastructure_improvements`` and ``monitoring_enhancements``.
            Each recommendation appears at most once per list, in first-seen
            order.
        """
        print("Generating optimization recommendations...")

        recommendations = {
            'immediate_actions': [],
            'configuration_changes': [],
            'infrastructure_improvements': [],
            'monitoring_enhancements': []
        }

        def add(bucket, *items):
            # Several bottlenecks of one category must not repeat the advice.
            target = recommendations[bucket]
            for item in items:
                if item not in target:
                    target.append(item)

        for bottleneck in bottlenecks:
            category = bottleneck['category']
            severity = bottleneck['severity']

            if category == 'CPU' and severity == 'high':
                add('immediate_actions',
                    "Review and optimize Zabbix server processes and polling intervals")
                add('configuration_changes',
                    "Adjust StartPollers, StartPingers parameters in zabbix_server.conf")

            elif category == 'Memory' and severity == 'high':
                add('immediate_actions',
                    "Check for memory leaks in Zabbix processes")
                # Lists actual cache-size parameters of zabbix_server.conf.
                add('configuration_changes',
                    "Optimize cache sizes: CacheSize, HistoryCacheSize, ValueCacheSize, etc.")

            elif category == 'Database':
                add('configuration_changes',
                    "Optimize MySQL/PostgreSQL configuration for Zabbix workload",
                    "Implement database partitioning for history tables",
                    "Add appropriate indexes for frequently queried columns")
                add('infrastructure_improvements',
                    "Consider SSD storage for database files")

            elif category == 'Data Collection':
                add('configuration_changes',
                    "Review and optimize item polling intervals",
                    "Implement bulk data collection where possible",
                    "Check agent connectivity and timeout settings")

            elif category == 'Storage':
                add('immediate_actions',
                    "Implement data housekeeping policies")
                add('infrastructure_improvements',
                    "Plan storage capacity expansion")

        # Monitoring improvements are always suggested
        add('monitoring_enhancements',
            "Implement Zabbix self-monitoring templates",
            "Set up performance baseline monitoring",
            "Create capacity planning dashboards",
            "Establish performance alerting thresholds")

        return recommendations

    def run_complete_analysis(self) -> Dict:
        """Run every analysis step and combine the results.

        Returns:
            ``{'error': ...}`` when the database connection fails, otherwise a
            dict with the raw metrics, ``bottlenecks``, ``recommendations`` and
            a ``summary``. The connection is closed before returning.
        """
        print("Starting comprehensive performance analysis...")

        # Database connection
        if not self.connect_database():
            return {'error': 'Database connection failed'}

        try:
            # Individual analyses
            system_data = self.analyze_system_performance()
            db_data = self.analyze_database_performance()
            zabbix_data = self.analyze_zabbix_internals()

            # Bottleneck detection
            bottlenecks = self.identify_performance_bottlenecks(system_data, db_data, zabbix_data)

            # Recommendations
            recommendations = self.generate_optimization_recommendations(bottlenecks)

            high_severity = sum(1 for b in bottlenecks if b['severity'] == 'high')
            if high_severity > 0:
                overall_status = 'critical'
            elif bottlenecks:
                overall_status = 'warning'
            else:
                overall_status = 'healthy'

            # Combined result
            return {
                'timestamp': datetime.now().isoformat(),
                'system_performance': system_data,
                'database_performance': db_data,
                'zabbix_internals': zabbix_data,
                'bottlenecks': bottlenecks,
                'recommendations': recommendations,
                'summary': {
                    'total_bottlenecks': len(bottlenecks),
                    'high_severity_issues': high_severity,
                    'overall_status': overall_status
                }
            }

        finally:
            if self.connection:
                self.connection.close()
                # Later analyze_* calls then return {} instead of touching a
                # closed connection.
                self.connection = None

    def save_analysis_report(self, analysis_result: Dict, filename: str = None):
        """Write the analysis result to a JSON file.

        Args:
            analysis_result: Data to serialise; values json cannot encode
                are converted with ``str``.
            filename: Target path. Defaults to a timestamped file name in the
                current working directory.

        Returns:
            The path that was written.
        """
        if not filename:
            filename = f"zabbix_performance_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(analysis_result, f, indent=2, default=str)

        print(f"Analysis report saved to: {filename}")
        return filename

# Usage example
def main():
    """Run a complete analysis, save the JSON report and print a summary.

    Connection settings default to the sample values below and can be
    overridden with the environment variables ZABBIX_DB_HOST, ZABBIX_DB_NAME,
    ZABBIX_DB_USER and ZABBIX_DB_PASSWORD, so real credentials do not have to
    be written into the script.

    Returns:
        0 when the analysis completed, 1 when it failed.
    """
    import os  # local import: only this entry point reads the environment

    # Database settings
    db_config = {
        'host': os.environ.get('ZABBIX_DB_HOST', 'localhost'),
        'database': os.environ.get('ZABBIX_DB_NAME', 'zabbix'),
        'user': os.environ.get('ZABBIX_DB_USER', 'zabbix'),
        'password': os.environ.get('ZABBIX_DB_PASSWORD', 'zabbix_password')
    }

    # Run the analysis
    analyzer = ZabbixPerformanceAnalyzer(db_config)
    result = analyzer.run_complete_analysis()

    if 'error' in result:
        print(f"Analysis failed: {result['error']}")
        return 1

    # Save the report
    report_file = analyzer.save_analysis_report(result)

    # Print the summary
    print("\n" + "="*50)
    print("PERFORMANCE ANALYSIS SUMMARY")
    print("="*50)

    summary = result['summary']
    print(f"Overall Status: {summary['overall_status'].upper()}")
    print(f"Total Issues: {summary['total_bottlenecks']}")
    print(f"High Severity: {summary['high_severity_issues']}")

    if result['bottlenecks']:
        print("\nTop Issues:")
        for i, bottleneck in enumerate(result['bottlenecks'][:5], 1):
            print(f"{i}. [{bottleneck['category']}] {bottleneck['description']}")

    print(f"\nDetailed report: {report_file}")
    return 0

if __name__ == "__main__":
    # Propagate failure to the shell so cron jobs and wrappers can detect it.
    raise SystemExit(main())

ネットワーク問題

ネットワーク接続問題の診断

包括的ネットワーク診断

yaml
# ネットワーク問題分類
ネットワーク問題カテゴリ:
  エージェント接続問題:
    症状:
      - "Host is unreachable"
      - "Agent timeout"
      - "Connection refused"
      - "No route to host"
    
    診断手順:
      - 1. 基本接続確認 (ping)
      - 2. ポート到達性確認 (telnet/nc)
      - 3. ファイアウォール確認
      - 4. エージェント状態確認
      - 5. ネットワーク経路確認
    
    一般的原因:
      - ファイアウォール設定
      - エージェント停止
      - ネットワーク設定変更
      - DNS解決問題

  性能問題:
    症状:
      - データ収集遅延
      - タイムアウトエラー頻発
      - 不安定な接続
      - パケットロス
    
    診断項目:
      - ネットワーク遅延 (RTT)
      - 帯域使用率
      - パケット損失率
      - TCP接続統計
    
    解決アプローチ:
      - 監視間隔調整
      - タイムアウト値調整
      - 帯域制限実装
      - QoS設定

  セキュリティ制限:
    症状:
      - 暗号化接続失敗
      - 認証エラー
      - 証明書エラー
      - アクセス拒否
    
    確認項目:
      - TLS/SSL設定
      - 証明書有効性
      - 暗号化設定
      - アクセス制御設定

自動ネットワーク診断スクリプト

bash
#!/bin/bash
# Zabbix network diagnostic and repair script

# Name of this tool.
SCRIPT_NAME="zabbix-network-diagnostic"
# Log file that log_diagnostic appends every entry to.
LOG_FILE="/var/log/zabbix/network_diagnostic.log"
# Report file the diagnostic functions write their findings to.
# NOTE(review): fixed, predictable path under /tmp — confirm this is acceptable
# on multi-user hosts.
REPORT_FILE="/tmp/network_diagnostic_report.txt"
# Zabbix server configuration; the DB* connection settings are read from it.
CONFIG_FILE="/etc/zabbix/zabbix_server.conf"

# Logging helper
#######################################
# Print a timestamped entry and append it to the log file.
# Globals:   LOG_FILE (read)
# Arguments: $1 - severity label (e.g. INFO, WARNING, ERROR, SUCCESS)
#            $2 - message text
# Outputs:   "[timestamp] [level] message" on stdout and in LOG_FILE
#######################################
log_diagnostic() {
    local level="$1"
    local message="$2"
    local timestamp log_dir

    # Assigned separately from `local` so a failing command is not masked.
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # tee cannot create missing parent directories; without this every call
    # fails with "No such file or directory" on a host without the log dir.
    log_dir=$(dirname -- "$LOG_FILE")
    [ -d "$log_dir" ] || mkdir -p -- "$log_dir" 2>/dev/null

    printf '[%s] [%s] %s\n' "$timestamp" "$level" "$message" | tee -a "$LOG_FILE"
}

# Basic network diagnosis
#######################################
# Check whether something listens on a TCP port of this host.
# Prefers `ss` (used elsewhere in this script) and falls back to `netstat`,
# which is not installed by default on many current distributions.
# Arguments: $1 - TCP port number
# Returns:   0 when a listening socket for the port is found, 1 otherwise
#######################################
_zbx_tcp_port_listening() {
    local port="$1"
    local sockets

    if command -v ss >/dev/null 2>&1; then
        sockets=$(ss -ltn 2>/dev/null)
    else
        sockets=$(netstat -ltn 2>/dev/null)
    fi

    # Match ":<port>" followed by whitespace so 10051 does not match 100510.
    grep -Eq "[:.]${port}[[:space:]]" <<< "$sockets"
}

#######################################
# Start a new report with interface, routing, DNS and Zabbix port status.
# Globals:   REPORT_FILE (truncated, then written)
# Outputs:   report sections in REPORT_FILE; log entries via log_diagnostic
#######################################
diagnose_basic_connectivity() {
    log_diagnostic "INFO" "Starting basic network connectivity diagnosis..."

    # The first redirection truncates the report: this function opens a new
    # report, everything after it appends.
    {
        echo "# Network Connectivity Diagnosis Report"
        echo "Generated: $(date)"
        echo ""

        # Local interfaces
        echo "## Local Network Interfaces"
        ip addr show | grep -E "^[0-9]|inet "
        echo ""

        # Routing table
        echo "## Routing Table"
        ip route show
        echo ""

        # DNS configuration
        echo "## DNS Configuration"
        if [ -r /etc/resolv.conf ]; then
            cat /etc/resolv.conf
        fi
        echo ""

        # Port status
        echo "## Zabbix Port Status"
    } > "$REPORT_FILE"

    # log_diagnostic writes to stdout, so these checks stay outside the
    # redirected group above.

    # Zabbix server port (10051)
    if _zbx_tcp_port_listening 10051; then
        echo "✓ Zabbix Server Port 10051: LISTENING" >> "$REPORT_FILE"
        log_diagnostic "SUCCESS" "Zabbix server port 10051 is listening"
    else
        echo "✗ Zabbix Server Port 10051: NOT LISTENING" >> "$REPORT_FILE"
        log_diagnostic "ERROR" "Zabbix server port 10051 is not listening"
    fi

    # Zabbix agent port (10050); only a warning when it is not listening
    if _zbx_tcp_port_listening 10050; then
        echo "✓ Zabbix Agent Port 10050: LISTENING" >> "$REPORT_FILE"
        log_diagnostic "SUCCESS" "Zabbix agent port 10050 is listening"
    else
        echo "⚠ Zabbix Agent Port 10050: NOT LISTENING" >> "$REPORT_FILE"
        log_diagnostic "WARNING" "Zabbix agent port 10050 is not listening"
    fi

    echo "" >> "$REPORT_FILE"
}

# Agent connectivity test
#######################################
# Read up to 20 enabled hosts with an agent interface from the Zabbix
# database and run test_single_host_connectivity for each of them.
# Globals:   CONFIG_FILE (read), REPORT_FILE (appended)
# Outputs:   report section in REPORT_FILE; log entries via log_diagnostic
#######################################
test_agent_connectivity() {
    log_diagnostic "INFO" "Testing agent connectivity..."

    echo "## Agent Connectivity Tests" >> "$REPORT_FILE"

    # Database settings from the server configuration. sed keeps everything
    # after the first "=", so values that contain "=" are not truncated, and
    # tail picks the last occurrence of a repeated key.
    local db_host db_name db_user db_password hosts
    local hostname address port
    db_host=$(sed -n 's/^DBHost=//p' "$CONFIG_FILE" 2>/dev/null | tail -n 1)
    db_name=$(sed -n 's/^DBName=//p' "$CONFIG_FILE" 2>/dev/null | tail -n 1)
    db_user=$(sed -n 's/^DBUser=//p' "$CONFIG_FILE" 2>/dev/null | tail -n 1)
    db_password=$(sed -n 's/^DBPassword=//p' "$CONFIG_FILE" 2>/dev/null | tail -n 1)

    # Zabbix server treats a missing DBHost as localhost.
    db_host="${db_host:-localhost}"

    if [ -n "$db_name" ] && [ -n "$db_user" ]; then
        # The password goes through the environment: it stays out of the
        # process list and an empty value cannot trigger mysql's interactive
        # prompt. (MYSQL_PWD is deprecated in MySQL 8 but still honoured.)
        # Rows are "|"-separated: unlike tabs, a non-whitespace separator
        # keeps empty columns in place when read splits the line.
        hosts=$(MYSQL_PWD="$db_password" mysql -h "$db_host" -u "$db_user" -s -N -e "
            SELECT CONCAT_WS('|', h.host, IF(i.useip = 1, i.ip, i.dns), i.port)
            FROM hosts h
            JOIN interface i ON h.hostid = i.hostid
            WHERE h.status = 0 AND i.type = 1
            LIMIT 20;" "$db_name")

        if [ -n "$hosts" ]; then
            echo "### Active Host Connectivity Tests" >> "$REPORT_FILE"

            # The list is fed on fd 3 so commands inside the loop cannot
            # consume it through stdin.
            while IFS='|' read -r -u 3 hostname address port; do
                test_single_host_connectivity "$hostname" "$address" "$port"
            done 3<<< "$hosts"
        else
            echo "No active hosts found in database" >> "$REPORT_FILE"
        fi
    else
        echo "Database configuration not available for host testing" >> "$REPORT_FILE"
    fi

    echo "" >> "$REPORT_FILE"
}

# Single host connectivity test
#######################################
# Test one monitored host: ICMP ping, TCP connect to the agent port and,
# when zabbix_get is installed, an agent.ping request.
# Globals:   REPORT_FILE (appended)
# Arguments: $1 - host name (used for display only)
#            $2 - IP address or DNS name to test
#            $3 - agent port (default 10050)
# Returns:   1 when no address is given or the port cannot be reached,
#            0 otherwise. A failed ping alone is reported but not fatal,
#            because ICMP is frequently filtered while the agent port is open.
#######################################
test_single_host_connectivity() {
    local hostname="$1"
    local ip="$2"
    local port="${3:-10050}"
    local agent_response

    echo "#### Testing: $hostname ($ip:$port)" >> "$REPORT_FILE"

    if [ -z "$ip" ]; then
        echo "  ✗ Address: NOT CONFIGURED" >> "$REPORT_FILE"
        log_diagnostic "WARNING" "No address configured for $hostname"
        echo "" >> "$REPORT_FILE"
        return 1
    fi

    # Ping test (informational)
    if ping -c 3 -W 2 "$ip" >/dev/null 2>&1; then
        echo "  ✓ Ping: SUCCESS" >> "$REPORT_FILE"
    else
        echo "  ✗ Ping: FAILED" >> "$REPORT_FILE"
        log_diagnostic "WARNING" "Ping failed for $hostname ($ip)"
    fi

    # Port test. Address and port are passed as positional parameters rather
    # than spliced into the command string, so values coming from the
    # database cannot be executed as shell code.
    if timeout 5 bash -c ': </dev/tcp/"$1"/"$2"' _ "$ip" "$port" 2>/dev/null; then
        echo "  ✓ Port $port: ACCESSIBLE" >> "$REPORT_FILE"
    else
        echo "  ✗ Port $port: NOT ACCESSIBLE" >> "$REPORT_FILE"
        log_diagnostic "WARNING" "Port $port not accessible on $hostname ($ip)"
        echo "" >> "$REPORT_FILE"
        return 1
    fi

    # Agent response test
    if command -v zabbix_get >/dev/null; then
        agent_response=$(timeout 10 zabbix_get -s "$ip" -p "$port" -k "agent.ping" 2>/dev/null)
        if [ "$agent_response" = "1" ]; then
            echo "  ✓ Agent Response: SUCCESS" >> "$REPORT_FILE"
        else
            echo "  ✗ Agent Response: FAILED ($agent_response)" >> "$REPORT_FILE"
            log_diagnostic "WARNING" "Agent response failed for $hostname ($ip)"
        fi
    else
        echo "  - Agent Response: zabbix_get not available" >> "$REPORT_FILE"
    fi

    echo "" >> "$REPORT_FILE"
    return 0
}

# Firewall diagnosis
#######################################
# Ask firewalld whether traffic to a Zabbix port is allowed, either through
# an explicit port rule or through the predefined firewalld service.
# NOTE(review): rich rules and non-default zones are not inspected.
# Arguments: $1 - TCP port number
#            $2 - firewalld service name covering that port
# Returns:   0 when allowed by port or service, 1 otherwise
#######################################
_zbx_firewalld_allows() {
    local port="$1"
    local service="$2"

    firewall-cmd --query-port="${port}/tcp" >/dev/null 2>&1 \
        || firewall-cmd --query-service="$service" >/dev/null 2>&1
}

#######################################
# Report the firewall state (firewalld, else iptables) for the Zabbix ports.
# Globals:   REPORT_FILE (appended)
# Outputs:   report section in REPORT_FILE; warnings via log_diagnostic
#######################################
diagnose_firewall() {
    local current_zone input_rules default_policy

    log_diagnostic "INFO" "Diagnosing firewall configuration..."

    echo "## Firewall Diagnosis" >> "$REPORT_FILE"

    # firewalld
    if systemctl is-active firewalld >/dev/null 2>&1; then
        current_zone=$(firewall-cmd --get-default-zone 2>/dev/null)

        {
            echo "### Firewalld Configuration"
            echo "Status: Active"

            # Default zone
            echo "Default Zone: $current_zone"

            # Open ports
            echo "Open Ports:"
            firewall-cmd --list-ports 2>/dev/null

            # Allowed services
            echo "Allowed Services:"
            firewall-cmd --list-services 2>/dev/null
        } >> "$REPORT_FILE"

        # Zabbix ports: a port opened through the zabbix-server/zabbix-agent
        # service counts as allowed as well.
        if _zbx_firewalld_allows 10051 zabbix-server; then
            echo "✓ Zabbix Server Port 10051: ALLOWED" >> "$REPORT_FILE"
        else
            echo "✗ Zabbix Server Port 10051: BLOCKED" >> "$REPORT_FILE"
            log_diagnostic "WARNING" "Firewall blocking Zabbix server port 10051"
        fi

        if _zbx_firewalld_allows 10050 zabbix-agent; then
            echo "✓ Zabbix Agent Port 10050: ALLOWED" >> "$REPORT_FILE"
        else
            echo "✗ Zabbix Agent Port 10050: BLOCKED" >> "$REPORT_FILE"
            log_diagnostic "WARNING" "Firewall blocking Zabbix agent port 10050"
        fi

    # iptables
    elif command -v iptables >/dev/null; then
        echo "### Iptables Configuration" >> "$REPORT_FILE"

        # INPUT chain; -n avoids slow reverse DNS lookups
        input_rules=$(iptables -L INPUT -n | grep -E "(10050|10051)")
        if [ -n "$input_rules" ]; then
            echo "Zabbix Related Rules:" >> "$REPORT_FILE"
            echo "$input_rules" >> "$REPORT_FILE"
        else
            echo "No specific Zabbix rules found in INPUT chain" >> "$REPORT_FILE"

            # Default policy, taken from the chain header line
            default_policy=$(iptables -L INPUT -n | head -n 1 | grep -o "policy [A-Z]*" | cut -d' ' -f2)
            echo "Default INPUT policy: $default_policy" >> "$REPORT_FILE"

            if [ "$default_policy" = "DROP" ] || [ "$default_policy" = "REJECT" ]; then
                log_diagnostic "WARNING" "Restrictive firewall policy may block Zabbix traffic"
            fi
        fi

    else
        echo "No recognized firewall system detected" >> "$REPORT_FILE"
    fi

    echo "" >> "$REPORT_FILE"
}

# Network performance diagnosis
#######################################
# Append interface counters, socket statistics and Zabbix connection data
# to the report.
# Globals:   REPORT_FILE (appended)
# Outputs:   report section in REPORT_FILE; one log entry via log_diagnostic
#######################################
diagnose_network_performance() {
    local zabbix_ports="(10050|10051)"
    local established_count

    log_diagnostic "INFO" "Diagnosing network performance..."

    # Everything below goes to the report through one shared redirection.
    {
        printf '%s\n' "## Network Performance Analysis"

        # Interface statistics: the two header lines plus eth*/ens*/enp* rows
        printf '%s\n' "### Network Interface Statistics"
        head -2 < /proc/net/dev
        grep -E "(eth|ens|enp)" < /proc/net/dev
        printf '\n'

        # TCP connection statistics
        printf '%s\n' "### TCP Connection Statistics"
        ss -s
        printf '\n'

        # Sockets listening on the Zabbix ports
        printf '%s\n' "### Active Zabbix Connections"
        ss -tuln | grep -E "$zabbix_ports"

        # Number of established connections involving the Zabbix ports
        printf '%s\n' "### Established Connections to Zabbix Ports"
        established_count=$(ss -tuna | grep -E "$zabbix_ports" | grep -c ESTAB)
        printf '%s\n' "Established connections: $established_count"

        printf '\n'
    } >> "$REPORT_FILE"
}

# DNS resolution check
#######################################
# Resolve the DNS names of up to 10 enabled agent interfaces stored in the
# Zabbix database and report which of them fail.
# Globals:   CONFIG_FILE (read), REPORT_FILE (appended)
# Outputs:   report section in REPORT_FILE; warnings via log_diagnostic
#######################################
test_dns_resolution() {
    log_diagnostic "INFO" "Testing DNS resolution..."

    echo "## DNS Resolution Tests" >> "$REPORT_FILE"

    # Database settings from the server configuration. sed keeps everything
    # after the first "=", so values that contain "=" are not truncated.
    local db_host db_name db_user db_password dns_hosts
    local dns_name resolved_ip
    db_host=$(sed -n 's/^DBHost=//p' "$CONFIG_FILE" 2>/dev/null | tail -n 1)
    db_name=$(sed -n 's/^DBName=//p' "$CONFIG_FILE" 2>/dev/null | tail -n 1)
    db_user=$(sed -n 's/^DBUser=//p' "$CONFIG_FILE" 2>/dev/null | tail -n 1)
    db_password=$(sed -n 's/^DBPassword=//p' "$CONFIG_FILE" 2>/dev/null | tail -n 1)

    # Zabbix server treats a missing DBHost as localhost.
    db_host="${db_host:-localhost}"

    if [ -n "$db_name" ] && [ -n "$db_user" ]; then
        # Password via the environment: not visible in the process list and
        # no interactive prompt when it is empty.
        dns_hosts=$(MYSQL_PWD="$db_password" mysql -h "$db_host" -u "$db_user" -s -N -e "
            SELECT DISTINCT i.dns
            FROM interface i
            JOIN hosts h ON i.hostid = h.hostid
            WHERE h.status = 0 AND i.type = 1 AND i.dns != ''
            LIMIT 10;" "$db_name")

        if [ -n "$dns_hosts" ]; then
            echo "### DNS Resolution Tests for Monitored Hosts" >> "$REPORT_FILE"

            while IFS= read -r -u 3 dns_name; do
                [ -n "$dns_name" ] || continue

                resolved_ip=""
                # getent goes through the system resolver (including
                # /etc/hosts) and is available where dig is not installed.
                if command -v getent >/dev/null 2>&1; then
                    resolved_ip=$(getent hosts "$dns_name" 2>/dev/null | awk 'NR == 1 { print $1 }')
                fi
                # Fall back to dig; lines ending in "." are CNAME targets,
                # not addresses, and are skipped.
                if [ -z "$resolved_ip" ] && command -v dig >/dev/null 2>&1; then
                    resolved_ip=$(dig +short "$dns_name" 2>/dev/null | grep -v '\.$' | head -n 1)
                fi

                if [ -n "$resolved_ip" ]; then
                    echo "✓ $dns_name -> $resolved_ip" >> "$REPORT_FILE"
                else
                    echo "✗ $dns_name -> RESOLUTION FAILED" >> "$REPORT_FILE"
                    log_diagnostic "WARNING" "DNS resolution failed for $dns_name"
                fi
            done 3<<< "$dns_hosts"
        else
            echo "No DNS-based hosts found" >> "$REPORT_FILE"
        fi
    else
        echo "Database configuration not available for DNS testing" >> "$REPORT_FILE"
    fi

    echo "" >> "$REPORT_FILE"
}

# Automatic repair
#######################################
# Apply a fixed set of repairs: open the Zabbix ports in firewalld, start
# the Zabbix agent when no agent is running, and add fallback DNS servers
# when the resolver configuration has none.
# Globals:   REPORT_FILE (appended)
#            RESOLV_CONF (read, optional) - resolver file to check and edit;
#                                           defaults to /etc/resolv.conf
# Outputs:   report section in REPORT_FILE; log entries via log_diagnostic
#######################################
auto_fix_network_issues() {
    local resolv_conf="${RESOLV_CONF:-/etc/resolv.conf}"
    local fixes_applied=0
    local reload_needed=0
    local spec port role

    log_diagnostic "INFO" "Attempting automatic network issue fixes..."

    echo "## Automatic Fix Attempts" >> "$REPORT_FILE"

    # Firewall: open the server (10051) and agent (10050) ports
    if systemctl is-active firewalld >/dev/null 2>&1; then
        for spec in "10051:server" "10050:agent"; do
            port="${spec%%:*}"
            role="${spec##*:}"

            if ! firewall-cmd --query-port="${port}/tcp" >/dev/null 2>&1; then
                if firewall-cmd --permanent --add-port="${port}/tcp" >/dev/null 2>&1; then
                    reload_needed=1
                    echo "✓ Opened Zabbix $role port $port in firewall" >> "$REPORT_FILE"
                    log_diagnostic "SUCCESS" "Opened firewall port $port"
                    fixes_applied=$((fixes_applied + 1))
                else
                    echo "✗ Failed to open Zabbix $role port $port" >> "$REPORT_FILE"
                    log_diagnostic "ERROR" "Failed to open firewall port $port"
                fi
            fi
        done

        # One reload activates all permanent changes made above.
        if [ "$reload_needed" -eq 1 ]; then
            firewall-cmd --reload >/dev/null 2>&1
        fi
    fi

    # Agent service: a running zabbix-agent2 already serves the agent port,
    # so the classic agent is only started when neither variant is active.
    if ! systemctl is-active zabbix-agent >/dev/null 2>&1 \
        && ! systemctl is-active zabbix-agent2 >/dev/null 2>&1; then
        if systemctl start zabbix-agent >/dev/null 2>&1; then
            echo "✓ Started Zabbix agent service" >> "$REPORT_FILE"
            log_diagnostic "SUCCESS" "Started Zabbix agent service"
            fixes_applied=$((fixes_applied + 1))
        else
            echo "✗ Failed to start Zabbix agent service" >> "$REPORT_FILE"
            log_diagnostic "ERROR" "Failed to start Zabbix agent service"
        fi
    fi

    # DNS: only real "nameserver" entries count; a commented-out line such
    # as "# nameserver 10.0.0.1" does not configure a resolver.
    if ! grep -Eq '^[[:space:]]*nameserver[[:space:]]' "$resolv_conf" 2>/dev/null; then
        if printf 'nameserver 8.8.8.8\nnameserver 8.8.4.4\n' >> "$resolv_conf" 2>/dev/null; then
            echo "✓ Added fallback DNS servers" >> "$REPORT_FILE"
            log_diagnostic "SUCCESS" "Added fallback DNS servers"
            fixes_applied=$((fixes_applied + 1))
        else
            echo "✗ Failed to add fallback DNS servers" >> "$REPORT_FILE"
            log_diagnostic "ERROR" "Failed to add fallback DNS servers"
        fi
    fi

    echo "" >> "$REPORT_FILE"
    echo "Total automatic fixes applied: $fixes_applied" >> "$REPORT_FILE"

    if [ "$fixes_applied" -gt 0 ]; then
        echo "⚠ Please test connectivity after automatic fixes" >> "$REPORT_FILE"
        log_diagnostic "INFO" "Applied $fixes_applied automatic fixes"
    fi

    echo "" >> "$REPORT_FILE"
}

# Recommendation section
#######################################
# Append the static recommendations and usage hints to the report.
# Globals:   REPORT_FILE (appended)
# Outputs:   "## Recommendations" section in REPORT_FILE; one log entry
#######################################
generate_recommendations() {
    log_diagnostic "INFO" "Generating network recommendations..."

    # Unquoted delimiter on purpose: $0 must expand to the script name.
    cat >> "$REPORT_FILE" <<EOF
## Recommendations
### General Network Recommendations
1. Regularly monitor network connectivity to all monitored hosts
2. Implement network redundancy for critical monitoring paths
3. Configure appropriate timeouts based on network characteristics
4. Use encrypted connections (PSK/certificates) for secure environments
5. Monitor network performance metrics (latency, packet loss)

### Automation Recommendations
1. Schedule regular network diagnostics
2. Implement automated agent health monitoring
3. Set up network path monitoring
4. Create network troubleshooting runbooks

### Script Usage
Run this script regularly:
- Daily: $0 quick
- Weekly: $0 full
- After changes: $0 full --fix

EOF
}

# Main entry point: run the selected diagnostics, optionally the automatic
# fixes, then append the recommendations and print a summary.
# Globals:   REPORT_FILE, LOG_FILE (read)
# Arguments (accepted in any order):
#   mode        first non-option argument; one of quick, firewall,
#               performance, dns, full (default: full)
#   --fix       run auto_fix_network_issues after the diagnostics
#   -h, --help  print usage via show_help and return without diagnosing
main() {
    local mode=""
    local auto_fix=""
    local arg

    # Options are recognised in any position so that "--fix" given on its own
    # enables the fix step instead of being consumed as the mode.
    for arg in "$@"; do
        case "$arg" in
            --fix)
                auto_fix="--fix"
                ;;
            -h|--help)
                show_help
                return 0
                ;;
            *)
                # Only the first positional argument selects the mode
                if [ -z "$mode" ]; then
                    mode="$arg"
                fi
                ;;
        esac
    done
    mode="${mode:-full}"

    echo "Zabbix Network Diagnostic Tool"
    echo "==============================="

    # An unrecognised mode still runs the full diagnosis, but no longer silently
    case "$mode" in
        quick|firewall|performance|dns|full)
            ;;
        *)
            echo "⚠ Unknown mode '$mode'; running full diagnosis" >&2
            mode="full"
            ;;
    esac

    log_diagnostic "INFO" "Starting network diagnostic (mode: $mode)..."

    case "$mode" in
        quick)
            diagnose_basic_connectivity
            test_agent_connectivity
            ;;
        firewall)
            diagnose_firewall
            ;;
        performance)
            diagnose_network_performance
            ;;
        dns)
            test_dns_resolution
            ;;
        full)
            diagnose_basic_connectivity
            test_agent_connectivity
            diagnose_firewall
            diagnose_network_performance
            test_dns_resolution
            ;;
    esac

    # Optional automatic repair step
    if [ "$auto_fix" = "--fix" ]; then
        auto_fix_network_issues
    fi

    generate_recommendations

    echo ""
    echo "Network diagnostic completed."
    echo "Report saved to: $REPORT_FILE"
    echo "Log file: $LOG_FILE"

    # Surface a warning when the report contains a failure marker; the
    # readability check keeps grep from erroring on a missing report file.
    if [ -r "$REPORT_FILE" ] && grep -q "✗" "$REPORT_FILE"; then
        echo ""
        echo "⚠ WARNING: Network issues detected!"
        echo "Please review the report for details."
    fi
}

# Print the usage text to stdout.
# Arguments: none
show_help() {
    # The whole help text is emitted by one printf, one argument per output
    # line; $0 is interpolated so the examples show the invoked script path.
    printf '%s\n' \
        "Zabbix Network Diagnostic Tool" \
        "" \
        "USAGE:" \
        "  $0 [mode] [options]" \
        "" \
        "MODES:" \
        "  full        Complete network diagnosis (default)" \
        "  quick       Basic connectivity check" \
        "  firewall    Firewall configuration check" \
        "  performance Network performance analysis" \
        "  dns         DNS resolution testing" \
        "" \
        "OPTIONS:" \
        "  --fix       Attempt automatic fixes" \
        "  --help      Show this help" \
        "" \
        "EXAMPLES:" \
        "  $0                    # Full network diagnosis" \
        "  $0 quick              # Quick connectivity check" \
        "  $0 full --fix         # Full diagnosis with auto-fix"
}

# Script entry: show usage and exit when the first argument asks for help,
# otherwise hand all arguments to main.
case "${1:-}" in
    --help|-h)
        show_help
        exit 0
        ;;
esac

main "$@"

データベース問題

データベース問題の診断と解決

データベース健康診断システム

sql
-- Zabbixデータベース診断SQLクエリ集

-- 1. Basic database information
-- Both rows share one value column, so the unit lives in the metric label;
-- this keeps the table count from being reported under a "GB" heading.
SELECT 
    'Database Size (GB)' as metric,
    ROUND(SUM(data_length + index_length) / 1024 / 1024 / 1024, 2) AS 'Value'
FROM information_schema.tables 
WHERE table_schema = 'zabbix'

UNION ALL

SELECT 
    'Total Tables' as metric,
    COUNT(*) AS 'Value'
FROM information_schema.tables 
WHERE table_schema = 'zabbix';

-- 2. Ten largest tables in the zabbix schema, ordered by data + index bytes
SELECT
    t.table_name,
    ROUND((t.data_length + t.index_length) / 1024 / 1024, 2) AS 'Size (MB)',
    t.table_rows,
    -- share of the combined size that is taken up by indexes
    ROUND(t.index_length / (t.data_length + t.index_length) * 100, 2) AS 'Index Ratio (%)'
FROM information_schema.TABLES AS t
WHERE t.table_schema = 'zabbix'
ORDER BY t.data_length + t.index_length DESC
LIMIT 10;

-- 3. History table statistics
-- MIN/MAX are taken on the integer clock column and converted once, instead
-- of converting every row with FROM_UNIXTIME before aggregating.
-- 'Est Size (MB)' multiplies the row count by a fixed per-row byte figure
-- (24 / 20 / 255), so it is a rough estimate only.
SELECT 
    'history' as table_name,
    COUNT(*) as record_count,
    FROM_UNIXTIME(MIN(clock)) as oldest_record,
    FROM_UNIXTIME(MAX(clock)) as newest_record,
    ROUND((COUNT(*) * 24) / 1024 / 1024, 2) as 'Est Size (MB)'
FROM history

UNION ALL

SELECT 
    'history_uint' as table_name,
    COUNT(*) as record_count,
    FROM_UNIXTIME(MIN(clock)) as oldest_record,
    FROM_UNIXTIME(MAX(clock)) as newest_record,
    ROUND((COUNT(*) * 20) / 1024 / 1024, 2) as 'Est Size (MB)'
FROM history_uint

UNION ALL

SELECT 
    'history_str' as table_name,
    COUNT(*) as record_count,
    FROM_UNIXTIME(MIN(clock)) as oldest_record,
    FROM_UNIXTIME(MAX(clock)) as newest_record,
    ROUND((COUNT(*) * 255) / 1024 / 1024, 2) as 'Est Size (MB)'
FROM history_str;

-- 4. Performance statistics
-- GLOBAL is explicit because a bare SHOW STATUS is session-scoped: counters
-- such as Questions and Slow_queries would otherwise describe only this
-- diagnostic connection rather than the whole server.
SHOW GLOBAL STATUS LIKE 'Connections';
SHOW GLOBAL STATUS LIKE 'Threads_connected';
SHOW GLOBAL STATUS LIKE 'Questions';
SHOW GLOBAL STATUS LIKE 'Slow_queries';
SHOW GLOBAL STATUS LIKE 'Innodb_buffer_pool_read_requests';
SHOW GLOBAL STATUS LIKE 'Innodb_buffer_pool_reads';

-- 5. Index selectivity check
-- Reports each index's cardinality as a percentage of the table_rows figure
-- from information_schema, limited to tables reporting more than 1000 rows.
SELECT
    idx.table_name,
    idx.index_name,
    idx.cardinality,
    ROUND(idx.cardinality / tbl.table_rows * 100, 2) AS selectivity_percent
FROM information_schema.statistics AS idx
INNER JOIN information_schema.tables AS tbl
    ON tbl.table_schema = idx.table_schema
   AND tbl.table_name = idx.table_name
WHERE idx.table_schema = 'zabbix'
  AND tbl.table_rows > 1000
  AND idx.cardinality IS NOT NULL
ORDER BY selectivity_percent DESC;

-- 6. Configuration check
-- Each statement returns its own one-row result set for the named variable.
-- NOTE(review): query_cache_size is presumably absent on MySQL 8.0+, where
-- that statement would return an empty set; confirm against the target version.
SHOW VARIABLES LIKE 'innodb_buffer_pool_size';
SHOW VARIABLES LIKE 'innodb_log_file_size';
SHOW VARIABLES LIKE 'query_cache_size';
SHOW VARIABLES LIKE 'max_connections';
SHOW VARIABLES LIKE 'tmp_table_size';
SHOW VARIABLES LIKE 'max_heap_table_size';

まとめ

体系的なZabbixトラブルシューティングアプローチにより、問題の迅速な特定・解決と安定した監視環境の維持が可能となります。

重要ポイント

  1. 体系的診断: 構造化されたアプローチによる効率的な問題特定
  2. 自動化: スクリプトとツールによる診断・解決の自動化
  3. 予防的対策: 問題発生前の兆候検知と対処
  4. 知識蓄積: トラブルシューティング事例の文書化と共有

次のステップ

次章では、実践的なシナリオ事例について学習し、様々な環境での実装パターンを習得します。


関連リンク