Commit 634acf6e authored by novelailab's avatar novelailab

Handle graceful exit when a CUDA OOM occurs,

and when the model couldn't be loaded.
parent ab64d290
......@@ -2,4 +2,5 @@ workers = 1
bind = "0.0.0.0:80"
worker_class = "uvicorn.workers.UvicornWorker"
timeout = 0
keep_alive=60
\ No newline at end of file
keep_alive=60
pidfile="gunicorn.pid"
\ No newline at end of file
......@@ -21,7 +21,7 @@ def init_config_model():
config.amp = True
elif config.amp == "0":
config.amp = False
is_dev = ""
environment = "production"
if os.environ['DEV'] == "True":
......@@ -80,8 +80,14 @@ def init_config_model():
# Instantiate our actual model.
load_time = time.time()
try:
model = StableDiffusionModel(config)
except Exception as e:
logger.error(f"Failed to load model: {str(e)}")
capture_exception(e)
#exit gunicorn
sys.exit(4)
model = StableDiffusionModel(config)
config.model = model
......
......@@ -20,8 +20,8 @@ import base64
#Initialize model and config
model, config = init_config_model()
logger = config.logger
#config.mainpid = open("app.pid", "r").read()
config.mainpid = int(open("gunicorn.pid", "r").read())
mainpid = config.mainpid
hostname = socket.gethostname()
sent_first_message = False
......@@ -92,7 +92,7 @@ def generate(request: GenerationRequest):
if "CUDA out of memory" in e_s or \
"an illegal memory access" in e_s or "CUDA" in e_s:
logger.error("GPU error, committing seppuku.")
os.kill(os.getpid(), signal.SIGTERM)
os.kill(mainpid, signal.SIGTERM)
return {"error": str(e)}
if __name__ == "__main__":
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment