Prometheus WAL Keeps on Growing Indefinitely

14.1k views Asked by At

Currently, I have a Prometheus v2.20.0 running, and it has an issue that the WAL keeps on growing indefinitely and consuming disk space.

Actually, disk space is not the issue right now; the problem is that the WAL folder is not getting cleaned up, so whenever Prometheus is restarted, it tries to load the entire WAL into memory.

So for example WAL is now 60GB, and memory is 32GB, so Prometheus keeps on restarting when it gets killed by the OOM, as it consumes the whole server memory of 24 GB.

Here is my current config for it, and please note that I run it using Docker Compose.

   - '--web.enable-admin-api'
   - '--config.file=/etc/prometheus/prometheus.yml'
   - '--web.external-url=https://prometheus.example.com'
   - '--storage.tsdb.path=/var/lib/prometheus'
   - '--storage.tsdb.retention=150d'
   - '--web.console.libraries=/usr/share/prometheus/console_libraries'
   - '--web.console.templates=/usr/share/prometheus/consoles'

So my question is: how can I configure it to do proper checkpointing and cleaning of the WAL so it won't keep growing indefinitely?

3

There are 3 answers

0
Wazery On BEST ANSWER

It seems to be a known bug in Prometheus v2.20.0, and an upgrade to v2.21.0 fixed it. https://github.com/prometheus/prometheus/issues/7955

0
mohamed sambo On

It happened to me once, on a dev EKS cluster, with a corrupted WAL. I exec'ed into the Prometheus instance and ran:

# rm -rf wal/*
# rm -rf chunks_head/*

Prometheus would come up again and run, but this solution deletes all persisted logs.

0
PraveenSam On

For those who still experience the same error and only use Prometheus for alerts, you can run the code below as a sidecar or service alongside the instance (Golang):

apiVersion: v1
kind: ConfigMap
metadata:
  name: server-code
data:
  sam.go: |
    package main

    import (
        "fmt"
        "log"
        "net/http"
        "os"
        "os/exec"
        "strconv"
        "strings"
        "time"
    )

    // delete removes the /data/wal and /data/chunks_head directory trees and
    // prints the outcome of each removal. A successful removal prints a nil
    // error, which fmt renders as <nil>.
    func delete() {
        for _, dir := range []string{"/data/wal", "/data/chunks_head"} {
            removeErr := os.RemoveAll(dir)
            fmt.Println(" Removed ", removeErr)
        }
    }
    // create makes sure /data/wal, /data/chunks_head and /data/queries.active
    // exist, creating whichever of them is missing and handing ownership of
    // everything it creates to uid/gid 65534.
    //
    // Directory failures are logged and skipped. Failing to create
    // /data/queries.active terminates the process via log.Fatal.
    func create() {
        for _, dir := range []string{"/data/wal", "/data/chunks_head"} {
            if _, err := os.Stat(dir); err == nil {
                continue // already present, leave it untouched
            }
            if err := os.MkdirAll(dir, os.FileMode(0777)); err != nil {
                log.Println(err)
                continue // nothing to chown if the directory could not be made
            }
            if err := os.Chown(dir, 65534, 65534); err != nil {
                log.Println(err)
            }
        }

        const activeQueries = "/data/queries.active"
        if _, err := os.Stat(activeQueries); !os.IsNotExist(err) {
            return
        }

        fmt.Println("Creating /data/queries.active ")
        emptyFile, err := os.Create(activeQueries)
        if err != nil {
            log.Fatal(err)
        }
        // Close right away on every path: only the file's existence is needed,
        // and leaving the handle open would leak a descriptor per creation.
        if err := emptyFile.Close(); err != nil {
            log.Println(err)
        }
        if err := os.Chown(activeQueries, 65534, 65534); err != nil {
            log.Println(err)
        }
    }
    func main() {
        for {
            time.Sleep(1 * time.Second)
            out, err := exec.Command("du", "-sk", "/data/wal").Output()
            if err == nil {
                d := strings.Fields(string(out))[0]
                f := strings.Replace(d, "K", "", 1)
                if f1, e := strconv.Atoi(f); f1 > 5242880 && e == nil {
                    delete()
                                    create()

                } else {
                    fmt.Println("Size is less "+d+" ==>  %q", (time.Now()))
                }

                url := "http://localhost:9090/graph"

                req, _ := http.NewRequest("GET", url, nil)

                res, _ := http.DefaultClient.Do(req)

                                 if res == nil {
                              delete()
                                              create()
                             }



            } else {
                fmt.Printf("Folder %q is not exists  ==>  %q"+"\n", ("/data/wal"), (time.Now()))
                            create()
            }

        }

    }