PHP服务等级协议SLA与可用性监控
# PHP服务等级协议SLA与可用性监控

SLA监控是运维的核心工作。需要跟踪服务的可用性、响应时间和错误率。今天说说PHP中SLA监控的实现。

SLA的关键指标包括可用性、响应时间和错误率。用百分比表示可用性，99.9%代表一年不超过8.76小时的停机时间。

```php
<?php

/**
 * Probes a set of HTTP endpoints with cURL and derives SLA figures
 * (availability, response time, error rate) from an in-memory history
 * of probe results.
 */
class SlaMonitor
{
    private string $serviceName;

    /** @var list<array{name: string, url: string, method: string}> Endpoints probed by check(). */
    private array $checkEndpoints = [];

    /** @var list<array<string, mixed>> History of probe results, oldest first (capped in recordResult()). */
    private array $results = [];

    /**
     * @param string $serviceName Name reported under the "service" key of generateReport().
     */
    public function __construct(string $serviceName)
    {
        $this->serviceName = $serviceName;
    }

    /**
     * Registers an endpoint to be probed on every check() call.
     *
     * NOTE(review): $method is stored with the endpoint but check() never
     * reads it; every probe is sent with CURLOPT_NOBODY enabled.
     */
    public function addEndpoint(string $name, string $url, string $method = 'GET'): void
    {
        $this->checkEndpoints[] = compact('name', 'url', 'method');
    }

    /**
     * Probes every registered endpoint once and records the outcome.
     *
     * An endpoint is marked "down" when the HTTP status is 500 or above,
     * or when cURL reports an error (timeout, refused connection, ...).
     *
     * @return list<array<string, mixed>> One result row per endpoint, in registration order.
     */
    public function check(): array
    {
        $results = [];

        foreach ($this->checkEndpoints as $endpoint) {
            $startTime = microtime(true);
            $status = 'up';
            $error = null;

            try {
                $ch = curl_init($endpoint['url']);
                curl_setopt_array($ch, [
                    CURLOPT_RETURNTRANSFER => true,
                    CURLOPT_TIMEOUT => 5,
                    CURLOPT_CONNECTTIMEOUT => 2,
                    CURLOPT_NOBODY => true,
                ]);
                curl_exec($ch);
                $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
                $error = curl_error($ch);
                curl_close($ch);

                if ($httpCode >= 500 || !empty($error)) {
                    $status = 'down';
                }
            } catch (\Exception $e) {
                $status = 'down';
                $error = $e->getMessage();
            }

            // Elapsed wall-clock time for the whole probe, in milliseconds.
            $responseTime = (microtime(true) - $startTime) * 1000;

            $result = [
                'endpoint' => $endpoint['name'],
                'url' => $endpoint['url'],
                'status' => $status,
                'response_time_ms' => round($responseTime, 2),
                'error' => $error,
                'checked_at' => date('Y-m-d H:i:s'),
            ];

            $results[] = $result;
            $this->recordResult($result);
        }

        return $results;
    }

    /**
     * Appends a result to the history, dropping the oldest entry once the
     * history grows beyond 1000 rows.
     */
    private function recordResult(array $result): void
    {
        $this->results[] = $result;

        if (count($this->results) > 1000) {
            array_shift($this->results);
        }
    }

    /**
     * Percentage of "up" results among the most recent results.
     *
     * NOTE(review): despite its name, $minutes is used as a result count
     * (array_slice with a negative offset), not as a time window.
     *
     * @return float Percentage rounded to 4 decimals; 100 when there is no history.
     */
    public function getAvailability(int $minutes = 60): float
    {
        $recent = array_slice($this->results, -$minutes);
        if (empty($recent)) {
            return 100.0;
        }

        $total = count($recent);
        $up = count(array_filter($recent, fn ($r) => $r['status'] === 'up'));

        return $total > 0 ? round($up / $total * 100, 4) : 0.0;
    }

    /**
     * Mean response time, in milliseconds, over the most recent $minutes results.
     * Non-positive timings are ignored.
     *
     * @return float Mean rounded to 2 decimals; 0 when there is nothing to average.
     */
    public function getAverageResponseTime(int $minutes = 60): float
    {
        $recent = array_slice($this->results, -$minutes);
        $times = array_filter(array_column($recent, 'response_time_ms'), fn ($t) => $t > 0);

        return !empty($times) ? round(array_sum($times) / count($times), 2) : 0.0;
    }

    /**
     * Percentage of "down" results among the most recent $minutes results.
     *
     * @return float Percentage rounded to 4 decimals; 0 when there is no history.
     */
    public function getErrorRate(int $minutes = 60): float
    {
        $recent = array_slice($this->results, -$minutes);
        if (empty($recent)) {
            return 0.0;
        }

        $errors = count(array_filter($recent, fn ($r) => $r['status'] === 'down'));

        return round($errors / count($recent) * 100, 4);
    }

    /**
     * Nearest-rank percentile of the response times (milliseconds) over the
     * most recent $minutes results.
     *
     * @param int $percentile Percentile to compute, e.g. 95 or 99.
     *
     * @return float The selected timing, or 0 when there are no positive timings.
     */
    public function getPercentileResponseTime(int $percentile, int $minutes = 60): float
    {
        $recent = array_slice($this->results, -$minutes);
        $times = array_filter(array_column($recent, 'response_time_ms'), fn ($t) => $t > 0);
        sort($times); // sort() also re-indexes the keys left sparse by array_filter()

        // ceil() returns a float; cast so the array offset is an int.
        $index = (int) ceil(count($times) * $percentile / 100) - 1;

        return $times[max(0, $index)] ?? 0.0;
    }

    /**
     * Builds a summary of the current SLA figures.
     *
     * "availability" and "error_rate" are formatted strings (with a "%"
     * suffix); the response-time entries and "total_checks" are numeric.
     *
     * @return array<string, mixed>
     */
    public function generateReport(): array
    {
        $uptime = $this->getAvailability();
        $uptime9s = $this->formatUptime9s($uptime);

        return [
            'service' => $this->serviceName,
            'period_minutes' => 60,
            'availability' => "{$uptime}% ({$uptime9s})",
            'avg_response_time_ms' => $this->getAverageResponseTime(),
            'p95_response_time_ms' => $this->getPercentileResponseTime(95),
            'p99_response_time_ms' => $this->getPercentileResponseTime(99),
            'error_rate' => "{$this->getErrorRate()}%",
            'total_checks' => count($this->results),
        ];
    }

    /**
     * Maps an availability percentage to its "number of nines" label.
     */
    private function formatUptime9s(float $uptime): string
    {
        if ($uptime >= 99.999) {
            return '五个九';
        }
        if ($uptime >= 99.99) {
            return '四个九';
        }
        if ($uptime >= 99.9) {
            return '三个九';
        }
        if ($uptime >= 99) {
            return '两个九';
        }

        return '低于SLA';
    }
}

$monitor = new SlaMonitor('支付服务');
$monitor->addEndpoint('支付API', 'http://localhost:8080/health');
$monitor->addEndpoint('数据库', 'http://localhost:8080/health/db');

// Ten probe rounds, 100 ms apart.
for ($i = 0; $i < 10; $i++) {
    $monitor->check();
    usleep(100000);
}

print_r($monitor->generateReport());
?>
```

## 告警规则的配置和触发

```php
<?php

/**
 * A single threshold rule: "<metric> <operator> <threshold>".
 */
class AlertRule
{
    public string $metric;
    public string $operator;
    public float $threshold;
    public string $message;

    /**
     * @param string $metric    Key of the report entry the rule applies to.
     * @param string $operator  One of '>', '>=', '<', '<='.
     * @param float  $threshold Value the metric is compared against.
     * @param string $message   Alert text; when empty, checkSlaAlerts() builds one from the rule.
     */
    public function __construct(
        string $metric,
        string $operator,
        float $threshold,
        string $message = ''
    ) {
        $this->metric = $metric;
        $this->operator = $operator;
        $this->threshold = $threshold;
        $this->message = $message;
    }

    /**
     * Returns true when $value violates the rule. Unknown operators never fire.
     */
    public function evaluate(float $value): bool
    {
        return match ($this->operator) {
            '>' => $value > $this->threshold,
            '>=' => $value >= $this->threshold,
            '<' => $value < $this->threshold,
            '<=' => $value <= $this->threshold,
            default => false,
        };
    }
}

/**
 * Evaluates every rule against the monitor's current report.
 *
 * NOTE(review): string-valued report entries are skipped, so rules on
 * "availability" or "error_rate" (formatted strings in generateReport())
 * never fire. A metric missing from the report is evaluated as 0.
 *
 * @param list<AlertRule> $rules
 *
 * @return list<array{metric: string, value: mixed, threshold: float, message: string}>
 */
function checkSlaAlerts(SlaMonitor $monitor, array $rules): array
{
    $alerts = [];
    $report = $monitor->generateReport();

    foreach ($rules as $rule) {
        $value = $report[$rule->metric] ?? 0;
        if (is_string($value)) {
            continue;
        }

        if ($rule->evaluate((float) $value)) {
            $alerts[] = [
                'metric' => $rule->metric,
                'value' => $value,
                'threshold' => $rule->threshold,
                'message' => $rule->message ?: "{$rule->metric} {$rule->operator} {$rule->threshold}",
            ];
        }
    }

    return $alerts;
}

$rules = [
    new AlertRule('avg_response_time_ms', '>', 500, '平均响应时间超过500ms'),
];

$alerts = checkSlaAlerts($monitor, $rules);

if (!empty($alerts)) {
    foreach ($alerts as $alert) {
        echo "[告警] {$alert['message']}\n";
    }
} else {
    echo "SLA监控正常\n";
}
?>
```

SLA监控是运维的基础工作。可用性、响应时间、错误率是三个核心指标。定期检查和告警可以在问题影响用户之前及时发现问题。对于关键业务，建议建立多层次的监控体系，包括基础设施监控、应用性能监控和用户体验监控。