Refactor code structure for improved readability and maintainability

2025-08-24 12:05:41 +02:00
parent 34f76242e6
commit c5ceea27b4
5 changed files with 4276 additions and 80 deletions
--- a/gradio_app.py
+++ b/gradio_app.py
@@ -2,7 +2,10 @@ import torch
 import torchaudio
 from einops import rearrange
 from stable_audio_tools import get_pretrained_model
+from omegaconf import OmegaConf
+from stable_audio_tools.models.factory import create_model_from_config
 from stable_audio_tools.inference.generation import generate_diffusion_cond
+from safetensors.torch import load_file as load_safetensors
 from pydub import AudioSegment
 import re
 import os
@@ -11,12 +14,55 @@ import gradio as gr

 # Define a function to toggle the visibility of the seed slider
 def toggle_seed_slider(x):
-    seed_slider.visible = not x
+    return gr.Slider(interactive=not x)

 # Define a function to set up the model and device
-def setup_model(model_half):
-    model, model_config = get_pretrained_model("audo/stable-audio-open-1.0")
+def setup_model(model_path, model_half):
+    """
+    Sets up the model and device.
+    Args:
+        model_path (str): Path to a local model .ckpt or .safetensors file. If empty or not found on disk, falls back to the default Hugging Face model.
+        model_half (bool): Whether to use float16 half-precision.
+    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
+    
+    # If no path is provided, or path doesn't exist, download the default model
+    if not model_path or not os.path.exists(model_path):
+        if model_path:
+             print(f"Warning: Model path '{model_path}' not found. Falling back to default model.")
+        model_id = "audo/stable-audio-open-1.0"
+        print(f"Loading default model from Hugging Face: {model_id}")
+        model, model_config = get_pretrained_model(model_id)
+    
+    # Otherwise, load the model from the local filesystem
+    else:
+        print(f"Loading local model from: {model_path}")
+        
+        # Find the model_config.json file in the same directory as the model
+        model_dir = os.path.dirname(model_path)
+        config_path = os.path.join(model_dir, "model_config.json")
+
+        if not os.path.exists(config_path):
+            raise FileNotFoundError(f"Error: Could not find 'model_config.json' in the same directory as the model: {model_dir}")
+
+        print(f"Loading model config from: {config_path}")
+        model_config = OmegaConf.load(config_path)
+        
+        # Create the model structure from the config
+        model = create_model_from_config(model_config)
+
+        # Load the weights from the checkpoint
+        if model_path.endswith(".safetensors"):
+            print("Loading weights from .safetensors file.")
+            state_dict = load_safetensors(model_path)
+        elif model_path.endswith(".ckpt"):
+            print("Loading weights from .ckpt file.")
+            state_dict = torch.load(model_path, map_location="cpu")["state_dict"]
+        else:
+            raise ValueError("Unsupported model file type. Please use .safetensors or .ckpt")
+        
+        model.load_state_dict(state_dict)
+
    model = model.to(device)
    
    # Convert model to float16 if model_half is True
@@ -92,7 +138,10 @@ def generate_audio(prompt, steps, cfg_scale, sigma_min, sigma_max, generation_ti

    return full_path

-def audio_generator(prompt, sampler_type, steps, cfg_scale, sigma_min, sigma_max, generation_time, random_seed, seed, model_half):
+def audio_generator(prompt, model_path, sampler_type, steps, cfg_scale, sigma_min, sigma_max, generation_time, random_seed, seed, model_half):
+    """
+    Main function called by the Gradio UI to orchestrate audio generation.
+    """
    try:
        print("Generating audio with parameters:")
        print("Prompt:", prompt)
@@ -107,7 +156,7 @@ def audio_generator(prompt, sampler_type, steps, cfg_scale, sigma_min, sigma_max
        print("Model Half Precision:", model_half)
        
        # Set up the model and device
-        model, model_config, device = setup_model(model_half)
+        model, model_config, device = setup_model(model_path, model_half)
        
        if random_seed:
            seed = torch.randint(0, 1000000, (1,)).item()
@@ -118,51 +167,127 @@ def audio_generator(prompt, sampler_type, steps, cfg_scale, sigma_min, sigma_max
        return str(e)

 # Create Gradio interface
-with gr.Blocks() as demo:
+# with gr.Blocks() as demo:
+#     gr.Markdown("<h1 style='text-align: center; font-size: 300%;'>💀🔊 StableAudioWebUI 💀🔊</h1>")
+
+#     # Main input components
+#     prompt_textbox = gr.Textbox(lines=5, label="Prompt")
+#     sampler_dropdown = gr.Dropdown(
+#         label="Sampler Type",
+#         choices=[
+#             "dpmpp-3m-sde",
+#             "dpmpp-2m-sde",
+#             "k-heun",
+#             "k-lms",
+#             "k-dpmpp-2s-ancestral",
+#             "k-dpm-2",
+#             "k-dpm-fast"
+#         ],
+#         value="dpmpp-3m-sde"
+#     )
+#     steps_slider = gr.Slider(minimum=0, maximum=200, label="Steps", step=1, value=100)
+#     generation_time_slider = gr.Slider(minimum=0, maximum=47, label="Generation Time (seconds)", step=1, value=47)
+#     random_seed_checkbox = gr.Checkbox(label="Random Seed")
+#     seed_slider = gr.Slider(minimum=-1, maximum=999999, label="Seed", step=1, value=123456)
+
+#     # Advanced parameters accordion
+#     with gr.Accordion("Advanced Parameters", open=False):
+#         cfg_scale_slider = gr.Slider(minimum=0, maximum=15, label="CFG Scale", step=0.1, value=7)
+#         sigma_min_slider = gr.Slider(minimum=0, maximum=50, label="Sigma Min", step=0.1, value=0.3)
+#         sigma_max_slider = gr.Slider(minimum=0, maximum=1000, label="Sigma Max", step=0.1, value=500)
+
+#     # Low VRAM checkbox and submit button
+#     model_half_checkbox = gr.Checkbox(label="Low VRAM (float16)", value=False)
+#     submit_button = gr.Button("Generate")
+
+#     # Define the output components
+#     audio_output = gr.Audio()
+#     output_textbox = gr.Textbox(label="Output")
+
+#     # Link the button and the function
+#     random_seed_checkbox.change(fn=toggle_seed_slider, inputs=[random_seed_checkbox], outputs=[seed_slider])
+#     submit_button.click(audio_generator,
+#                         inputs=[prompt_textbox, sampler_dropdown, steps_slider, cfg_scale_slider,sigma_min_slider, sigma_max_slider, generation_time_slider, random_seed_checkbox, seed_slider, model_half_checkbox],
+#                         outputs=[audio_output, output_textbox])
+
+#     # GitHub link at the bottom
+#     gr.Markdown("<p style='text-align: center;'><a href='https://github.com/Saganaki22/StableAudioWebUI'>Github Repository</a></p>")
+
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("<h1 style='text-align: center; font-size: 300%;'>💀🔊 StableAudioWebUI 💀🔊</h1>")

-    # Main input components
-    prompt_textbox = gr.Textbox(lines=5, label="Prompt")
-    sampler_dropdown = gr.Dropdown(
-        label="Sampler Type",
-        choices=[
-            "dpmpp-3m-sde",
-            "dpmpp-2m-sde",
-            "k-heun",
-            "k-lms",
-            "k-dpmpp-2s-ancestral",
-            "k-dpm-2",
-            "k-dpm-fast"
-        ],
-        value="dpmpp-3m-sde"
-    )
-    steps_slider = gr.Slider(minimum=0, maximum=200, label="Steps", step=1, value=100)
-    generation_time_slider = gr.Slider(minimum=0, maximum=47, label="Generation Time (seconds)", step=1, value=47)
-    random_seed_checkbox = gr.Checkbox(label="Random Seed")
-    seed_slider = gr.Slider(minimum=-1, maximum=999999, label="Seed", step=1, value=123456)
+    with gr.Row():
+        with gr.Column(scale=2):
+            # Main input components
+            prompt_textbox = gr.Textbox(lines=5, label="Prompt", placeholder="A beautiful orchestral piece with violins, piano, and a choir...")
+            
+            # Optional path to a local checkpoint; leaving it blank uses the default model
+            model_path_textbox = gr.Textbox(
+                label="Local Model Path (Optional)",
+                placeholder="e.g., /home/user/models/stable-audio-open-1.0.ckpt. Leave blank for default."
+            )

-    # Advanced parameters accordion
-    with gr.Accordion("Advanced Parameters", open=False):
-        cfg_scale_slider = gr.Slider(minimum=0, maximum=15, label="CFG Scale", step=0.1, value=7)
-        sigma_min_slider = gr.Slider(minimum=0, maximum=50, label="Sigma Min", step=0.1, value=0.3)
-        sigma_max_slider = gr.Slider(minimum=0, maximum=1000, label="Sigma Max", step=0.1, value=500)
+            sampler_dropdown = gr.Dropdown(
+                label="Sampler Type",
+                choices=[
+                    "dpmpp-3m-sde",
+                    "dpmpp-2m-sde",
+                    "k-heun",
+                    "k-lms",
+                    "k-dpmpp-2s-ancestral",
+                    "k-dpm-2",
+                    "k-dpm-fast"
+                ],
+                value="dpmpp-3m-sde"
+            )
+            
+            with gr.Row():
+                steps_slider = gr.Slider(minimum=10, maximum=200, label="Steps", step=1, value=100)
+                generation_time_slider = gr.Slider(minimum=1, maximum=47, label="Generation Time (seconds)", step=1, value=47)

-    # Low VRAM checkbox and submit button
-    model_half_checkbox = gr.Checkbox(label="Low VRAM (float16)", value=False)
-    submit_button = gr.Button("Generate")
+            with gr.Row():
+                random_seed_checkbox = gr.Checkbox(label="Random Seed", value=True)
+                seed_slider = gr.Slider(minimum=-1, maximum=999999, label="Seed", step=1, value=12345, interactive=False)

-    # Define the output components
-    audio_output = gr.Audio()
-    output_textbox = gr.Textbox(label="Output")
+            # Advanced parameters accordion
+            with gr.Accordion("Advanced Parameters", open=False):
+                cfg_scale_slider = gr.Slider(minimum=0, maximum=25, label="CFG Scale", step=0.1, value=7)
+                sigma_min_slider = gr.Slider(minimum=0.01, maximum=50, label="Sigma Min", step=0.01, value=0.3)
+                sigma_max_slider = gr.Slider(minimum=1, maximum=1000, label="Sigma Max", step=1, value=500)
+
+            # Low VRAM checkbox and submit button
+            model_half_checkbox = gr.Checkbox(label="Low VRAM (float16)", value=False)
+            submit_button = gr.Button("Generate", variant="primary")
+
+        with gr.Column(scale=1):
+            # Define the output components
+            audio_output = gr.Audio(label="Generated Audio")
+            output_textbox = gr.Textbox(label="Status", interactive=False)

    # Link the button and the function
    random_seed_checkbox.change(fn=toggle_seed_slider, inputs=[random_seed_checkbox], outputs=[seed_slider])
-    submit_button.click(audio_generator,
-                        inputs=[prompt_textbox, sampler_dropdown, steps_slider, cfg_scale_slider,sigma_min_slider, sigma_max_slider, generation_time_slider, random_seed_checkbox, seed_slider, model_half_checkbox],
-                        outputs=[audio_output, output_textbox])
+    
+    # Inputs are passed positionally, so their order must match audio_generator's parameters
+    submit_button.click(
+        fn=audio_generator,
+        inputs=[
+            prompt_textbox, 
+            model_path_textbox,
+            sampler_dropdown, 
+            steps_slider, 
+            cfg_scale_slider,
+            sigma_min_slider, 
+            sigma_max_slider, 
+            generation_time_slider, 
+            random_seed_checkbox, 
+            seed_slider, 
+            model_half_checkbox
+        ],
+        outputs=[audio_output, output_textbox]
+    )

    # GitHub link at the bottom
-    gr.Markdown("<p style='text-align: center;'><a href='https://github.com/Saganaki22/StableAudioWebUI'>Github Repository</a></p>")
+    gr.Markdown("<p style='text-align: center;'><a href='https://github.com/Saganaki22/StableAudioWebUI' target='_blank'>Github Repository</a></p>")

 # Launch the Gradio demo
 demo.launch()