Upload 3 files

Browse files

Files changed (3) hide show

demo.py +59 -0
ebanyvae.pt +3 -0
ebanyvae.py +269 -0

demo.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import gradio as gr
+import torch
+import torchaudio
+import os
+import numpy as np
+from ebanyvae import EbanyCodec, CodecConfig
+# Checkpoint path; a relative path, so it is resolved against the current working directory.
+WEIGHTS_FILE = "ebanyvae.pt"
+# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+def init_engine():
+    """Build the codec, load WEIGHTS_FILE if possible, and return it in eval mode on DEVICE.
+    Loading stays best-effort: when the checkpoint is missing or cannot be loaded the
+    model is still returned, but the reason is logged instead of being discarded, so a
+    model running without the trained weights is no longer silent.
+    """
+    import logging
+    log = logging.getLogger(__name__)
+    codec = EbanyCodec()
+    if not os.path.exists(WEIGHTS_FILE):
+        log.warning("Checkpoint %s not found; continuing without loading weights.", WEIGHTS_FILE)
+    else:
+        try:
+            # NOTE(review): torch.load unpickles the file, so only load checkpoints from a
+            # trusted source (consider weights_only=True if the installed torch supports it).
+            params = torch.load(WEIGHTS_FILE, map_location="cpu")
+            codec.load_state_dict(params, strict=True)
+        except Exception:
+            # Typical causes: key/shape mismatch under strict=True, or a file that is not
+            # a real checkpoint (for example an un-fetched Git LFS pointer).
+            log.exception("Could not load checkpoint %s; continuing without it.", WEIGHTS_FILE)
+    codec.to(DEVICE)
+    codec.eval()
+    return codec
+# Built once at import time; process_signal reuses this single instance for every request.
+processor = init_engine()
+def process_signal(input_file):
+    """Run one audio file through the codec (encode, then decode) and return the result.
+    Args:
+        input_file: path of an audio file loadable by torchaudio, or None.
+    Returns:
+        ``(sample_rate, samples)`` where ``samples`` is a 1-D float NumPy array with the
+        same number of samples as the resampled mono input, or None when no file was
+        given or processing failed (the failure is logged).
+    """
+    if input_file is None:
+        return None
+    import logging
+    try:
+        signal, fs = torchaudio.load(input_file)
+        internal_sr = processor.cfg.sr
+        if fs != internal_sr:
+            resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=internal_sr)
+            signal = resampler(signal)
+        if signal.shape[0] > 1:
+            # Down-mix multi-channel audio to mono by averaging the channels.
+            signal = signal.mean(dim=0, keepdim=True)
+        n_samples = signal.shape[-1]
+        input_tensor = signal.unsqueeze(0).to(DEVICE)
+        with torch.no_grad():
+            z = processor.encode(input_tensor, internal_sr)
+            out_tensor = processor.decode(z)
+        # encode() right-pads the waveform to a multiple of the hop size; drop that tail
+        # so the reconstruction is not longer than what the user uploaded.
+        audio_out = out_tensor.reshape(-1)[:n_samples].cpu().float().numpy()
+        return (internal_sr, audio_out)
+    except Exception:
+        # Still return None so the UI simply shows no output, but keep a traceback.
+        logging.getLogger(__name__).exception("Failed to process %s", input_file)
+        return None
+# Gradio UI: one row with two columns (input + button, then output).
+theme = gr.themes.Soft()
+with gr.Blocks(theme=theme, title="Neural Audio Processor") as interface:
+    gr.Markdown("### Neural Codec Reconstruction Test")
+    with gr.Row():
+        with gr.Column():
+            # type="filepath" hands process_signal a path string rather than decoded samples.
+            audio_in = gr.Audio(type="filepath", label="Source Signal")
+            run_btn = gr.Button("Process Signal", variant="primary")
+        with gr.Column():
+            audio_out = gr.Audio(label="Synthesized Output")
+    # Clicking the button calls process_signal with the input audio and sends its return value to the output player.
+    run_btn.click(fn=process_signal, inputs=audio_in, outputs=audio_out)
+if __name__ == "__main__":
+    # NOTE(review): binds to all interfaces and requests a public share link - confirm this exposure is intended.
+    interface.launch(server_name="0.0.0.0", share=True)

ebanyvae.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:75640ec86cde3e0ccf2109e49d4b919d6682c5e3458d042311abb432b907c77e
+size 346029598

ebanyvae.py ADDED Viewed

	@@ -0,0 +1,269 @@

+import math
+from functools import partial
+from typing import List, Optional, Tuple, Dict
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils import weight_norm
+from pydantic import BaseModel
+class WeightNormWrapper(nn.Module):
+    """Holder for a static helper that applies weight normalisation to a module."""
+    @staticmethod
+    def wrap(module):
+        """Return *module* after passing it through ``weight_norm``."""
+        normalised = weight_norm(module)
+        return normalised
+class SineAct(nn.Module):
+    """Periodic activation ``x + sin(a * x) ** 2 / a`` with one learnable ``a`` per channel.
+    Accepts input of shape ``(batch, channels, ...)``: trailing dimensions are flattened
+    for the computation and restored afterwards. Previously the shape was unpacked into
+    exactly three names, which rejected anything but 3-D input and made the
+    flatten/restore step a no-op.
+    """
+    def __init__(self, channels: int):
+        super().__init__()
+        # Shape (1, channels, 1) broadcasts over the batch and the flattened positions.
+        self.freq_param = nn.Parameter(torch.ones(1, channels, 1))
+    def forward(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Apply the activation element-wise and return a tensor shaped like the input."""
+        original_shape = input_tensor.shape
+        flat_x = input_tensor.reshape(original_shape[0], original_shape[1], -1)
+        # The small epsilon guards the reciprocal against a zero frequency parameter.
+        recip_alpha = 1.0 / (self.freq_param + 1e-9)
+        sine_part = torch.square(torch.sin(self.freq_param * flat_x))
+        out = flat_x + recip_alpha * sine_part
+        return out.reshape(original_shape)
+class TemporalConv(nn.Conv1d):
+    """``nn.Conv1d`` whose padding is applied by hand, on the left side of the last axis only.
+    A ``padding`` keyword, if given, is overridden with 0; ``pad_val`` decides the manual
+    padding instead (``2 * pad_val`` zeros prepended before the convolution).
+    """
+    def __init__(self, *args, pad_val: int = 0, **kwargs):
+        # Neutralise the built-in symmetric padding; forward() pads explicitly.
+        conv_kwargs = {key: (0 if key == 'padding' else value) for key, value in kwargs.items()}
+        super().__init__(*args, **conv_kwargs)
+        self.pad_val = pad_val
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Left-pad *x* when ``pad_val`` is positive, then run the convolution."""
+        if self.pad_val <= 0:
+            return super().forward(x)
+        left_padded = F.pad(x, (self.pad_val * 2, 0))
+        return super().forward(left_padded)
+class TemporalTransposeConv(nn.ConvTranspose1d):
+    """``nn.ConvTranspose1d`` that runs unpadded and then trims samples from the end.
+    ``padding`` and ``output_padding`` keywords, if given, are overridden with 0. The
+    number of trailing samples removed is ``2 * pad_val - out_pad`` when that is positive.
+    """
+    def __init__(self, *args, pad_val: int = 0, out_pad: int = 0, **kwargs):
+        # Both built-in padding options are disabled; forward() trims instead.
+        for key in ('padding', 'output_padding'):
+            if key in kwargs:
+                kwargs[key] = 0
+        super().__init__(*args, **kwargs)
+        self.pad_val = pad_val
+        self.out_pad = out_pad
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Up-sample *x*, then drop the surplus samples at the end of the last axis."""
+        upsampled = super().forward(x)
+        surplus = 2 * self.pad_val - self.out_pad
+        if surplus <= 0:
+            return upsampled
+        return upsampled[..., :-surplus]
+def get_normed_conv(in_c, out_c, k, d=1, p=0, g=1, s=1, bias=True):
+    """Build a weight-normalised ``TemporalConv``.
+    ``p`` is forwarded both as ``padding`` and as ``pad_val``; the short names map to
+    kernel size (``k``), dilation (``d``), groups (``g``) and stride (``s``).
+    """
+    conv_options = dict(
+        kernel_size=k,
+        stride=s,
+        padding=p,
+        dilation=d,
+        groups=g,
+        bias=bias,
+        pad_val=p,
+    )
+    conv = TemporalConv(in_c, out_c, **conv_options)
+    return weight_norm(conv)
+def get_normed_transpose(in_c, out_c, k, s, p, op):
+    """Build a weight-normalised ``TemporalTransposeConv``.
+    ``p`` is forwarded as both ``padding`` and ``pad_val``, and ``op`` as both
+    ``output_padding`` and ``out_pad``; ``k`` is the kernel size and ``s`` the stride.
+    """
+    upsample_options = dict(
+        kernel_size=k,
+        stride=s,
+        padding=p,
+        output_padding=op,
+        pad_val=p,
+        out_pad=op,
+    )
+    upsampler = TemporalTransposeConv(in_c, out_c, **upsample_options)
+    return weight_norm(upsampler)
+class ResidualUnit(nn.Module):
+    """Residual block: SineAct -> dilated conv -> SineAct -> 1x1 conv, added to the input.
+    Args:
+        channels: channel count of both input and output.
+        dilation_rate: dilation of the first convolution.
+        kernel: kernel size of the first convolution.
+        groups: ``groups`` of the first convolution.
+    """
+    def __init__(self, channels: int, dilation_rate: int, kernel: int = 7, groups: int = 1):
+        super().__init__()
+        effective_padding = ((kernel - 1) * dilation_rate) // 2
+        self.ops = nn.Sequential(
+            SineAct(channels),
+            get_normed_conv(channels, channels, k=kernel, d=dilation_rate, p=effective_padding, g=groups),
+            SineAct(channels),
+            get_normed_conv(channels, channels, k=1)
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return ``x + ops(x)``, cropping ``x`` if the branch output is shorter."""
+        identity = x
+        out = self.ops(x)
+        diff = identity.shape[-1] - out.shape[-1]
+        if diff > 0:
+            # Centre-crop the skip path to the branch length. An explicit end index is
+            # used because ``[start:-start]`` yields an empty slice when start == 0
+            # (diff == 1) and the wrong length for any odd diff.
+            start = diff // 2
+            identity = identity[..., start:start + out.shape[-1]]
+        return identity + out
+class EncoderStep(nn.Module):
+    """One encoder stage: three ResidualUnits (dilations 1, 3, 9), SineAct, then a strided conv.
+    ``in_ch`` falls back to ``out_ch // 2`` when it is not given; ``factor`` is the stride
+    of the final convolution, whose kernel size is ``2 * factor``.
+    """
+    def __init__(self, out_ch: int, in_ch: Optional[int] = None, factor: int = 1, groups: int = 1):
+        super().__init__()
+        if not in_ch:
+            in_ch = out_ch // 2
+        stages = []
+        for dilation in (1, 3, 9):
+            stages.append(ResidualUnit(in_ch, dilation_rate=dilation, groups=groups))
+        stages.append(SineAct(in_ch))
+        strided_conv = get_normed_conv(in_ch, out_ch, k=2 * factor, s=factor, p=math.ceil(factor / 2))
+        stages.append(strided_conv)
+        self.ops = nn.Sequential(*stages)
+    def forward(self, x):
+        """Run the stage on *x*."""
+        return self.ops(x)
+class LatentEncoder(nn.Module):
+    """Convolutional encoder turning a 1-channel signal into latent mean / log-variance maps.
+    Args:
+        base_ch: channels produced by the stem convolution; doubled before every stage.
+        z_dim: channel count of the ``mean`` and ``logvar`` heads.
+        ratios: per-stage factors passed to ``EncoderStep`` (any iterable of ints).
+        is_depthwise: when True, each stage is built with ``groups`` equal to half of
+            its output channels; otherwise ``groups`` is 1.
+    """
+    def __init__(self, base_ch: int = 64, z_dim: int = 32, ratios=(2, 4, 8, 8), is_depthwise: bool = False):
+        super().__init__()
+        # The default is a tuple: a list default would be one object shared by every call.
+        stages = [get_normed_conv(1, base_ch, k=7, p=3)]
+        current_ch = base_ch
+        for r in ratios:
+            current_ch *= 2
+            grp = current_ch // 2 if is_depthwise else 1
+            stages.append(EncoderStep(out_ch=current_ch, factor=r, groups=grp))
+        # Registered as ``layers`` and before the heads, so state-dict keys are unchanged.
+        self.layers = nn.Sequential(*stages)
+        self.calc_mu = get_normed_conv(current_ch, z_dim, k=3, p=1)
+        self.calc_logvar = get_normed_conv(current_ch, z_dim, k=3, p=1)
+    def forward(self, x):
+        """Return a dict with the feature map ``h`` and the ``mean`` / ``logvar`` head outputs."""
+        h = self.layers(x)
+        return {
+            "h": h,
+            "mean": self.calc_mu(h),
+            "logvar": self.calc_logvar(h)
+        }
+class StochasticInjector(nn.Module):
+    """Adds Gaussian noise to the input, scaled per element by a learned 1x1 projection of it."""
+    def __init__(self, dim):
+        super().__init__()
+        projection = TemporalConv(dim, dim, kernel_size=1, bias=False, pad_val=0)
+        self.proj = weight_norm(projection)
+    def forward(self, x):
+        """Return ``x + noise * proj(x)``; one noise channel is drawn and broadcast over all channels."""
+        single_channel = x[:, :1, :]
+        noise = torch.randn_like(single_channel)
+        scaled_noise = noise * self.proj(x)
+        return x + scaled_noise
+class DecoderStep(nn.Module):
+    """One decoder stage: SineAct, transposed conv, optional noise injection, three ResidualUnits.
+    ``factor`` is the stride of the transposed convolution (kernel size ``2 * factor``);
+    the residual units use dilations 1, 3 and 9 on ``out_ch`` channels.
+    """
+    def __init__(self, in_ch: int, out_ch: int, factor: int, groups: int = 1, noise: bool = False):
+        super().__init__()
+        upsampler = get_normed_transpose(
+            in_ch,
+            out_ch,
+            k=2 * factor,
+            s=factor,
+            p=math.ceil(factor / 2),
+            op=factor % 2,
+        )
+        head = [SineAct(in_ch), upsampler]
+        injector = [StochasticInjector(out_ch)] if noise else []
+        residuals = [ResidualUnit(out_ch, dilation_rate=d, groups=groups) for d in (1, 3, 9)]
+        self.ops = nn.Sequential(*head, *injector, *residuals)
+    def forward(self, x):
+        """Run the stage on *x*."""
+        return self.ops(x)
+class LatentDecoder(nn.Module):
+    """Convolutional decoder mapping a latent map back to a Tanh-bounded signal.
+    Args:
+        z_dim: channel count of the latent input.
+        start_ch: channel count after the input projection; halved by every stage.
+        ratios: per-stage factors passed to ``DecoderStep``.
+        is_depthwise: when True the input projection is a grouped k=7 conv followed by
+            a 1x1 conv, and each stage is built with ``groups`` equal to its output channels.
+        out_channels: channel count of the final convolution.
+        use_noise: forwarded to every ``DecoderStep`` as ``noise``.
+    """
+    def __init__(self, z_dim, start_ch, ratios, is_depthwise=False, out_channels=1, use_noise=False):
+        super().__init__()
+        sequence = []
+        if is_depthwise:
+            sequence.extend([
+                get_normed_conv(z_dim, z_dim, k=7, p=3, g=z_dim),
+                get_normed_conv(z_dim, start_ch, k=1)
+            ])
+        else:
+            sequence.append(get_normed_conv(z_dim, start_ch, k=7, p=3))
+        # Track the running width from start_ch so an empty ``ratios`` no longer hits an
+        # undefined loop variable when the output head is built.
+        final_dim = start_ch
+        for i, r in enumerate(ratios):
+            dim_in = start_ch // (2 ** i)
+            dim_out = start_ch // (2 ** (i + 1))
+            grp = dim_out if is_depthwise else 1
+            sequence.append(
+                DecoderStep(dim_in, dim_out, factor=r, groups=grp, noise=use_noise)
+            )
+            final_dim = dim_out
+        sequence.extend([
+            SineAct(final_dim),
+            get_normed_conv(final_dim, out_channels, k=7, p=3),
+            nn.Tanh()
+        ])
+        self.sequence = nn.Sequential(*sequence)
+    def forward(self, x):
+        """Decode the latent map *x*."""
+        return self.sequence(x)
+class CodecConfig(BaseModel):
+    """Hyper-parameters consumed by ``EbanyCodec``."""
+    # Channel count of the encoder stem.
+    enc_dim: int = 64
+    # Per-stage encoder factors; their product is the codec hop size.
+    enc_ratios: List[int] = [2, 3, 6, 7, 7]
+    # Latent channel count. Optional because EbanyCodec has an explicit ``z_dim is None``
+    # branch (deriving the size from the encoder); a plain ``int`` field rejected None.
+    z_dim: Optional[int] = 64
+    # Channel count at the start of the decoder.
+    dec_dim: int = 2048
+    # Per-stage decoder factors.
+    dec_ratios: List[int] = [7, 7, 6, 3, 2]
+    # Selects the grouped-convolution variants in encoder and decoder.
+    depthwise_conv: bool = True
+    # Sample rate exposed to callers as ``cfg.sr``.
+    sr: int = 44100
+    # Enables the noise injector in each decoder stage.
+    noise_injection: bool = False
+class EbanyCodec(nn.Module):
+    """Audio autoencoder pairing a ``LatentEncoder`` with a ``LatentDecoder``.
+    ``hop`` is the product of the encoder ratios; inputs are right-padded to a multiple
+    of it before encoding.
+    """
+    def __init__(self, cfg: Optional[CodecConfig] = None):
+        super().__init__()
+        self.cfg = CodecConfig() if cfg is None else cfg
+        settings = self.cfg
+        # Latent width: the configured value, or derived from the encoder when unset.
+        calc_dim = settings.z_dim
+        if calc_dim is None:
+            calc_dim = settings.enc_dim * (2 ** len(settings.enc_ratios))
+        self.encoder = LatentEncoder(
+            base_ch=settings.enc_dim,
+            z_dim=calc_dim,
+            ratios=settings.enc_ratios,
+            is_depthwise=settings.depthwise_conv,
+        )
+        self.decoder = LatentDecoder(
+            z_dim=calc_dim,
+            start_ch=settings.dec_dim,
+            ratios=settings.dec_ratios,
+            is_depthwise=settings.depthwise_conv,
+            use_noise=settings.noise_injection,
+        )
+        self.hop = math.prod(settings.enc_ratios)
+    def _pad_audio(self, wav):
+        """Right-pad the last axis of *wav* with zeros up to the next multiple of ``hop``."""
+        missing = (-wav.shape[-1]) % self.hop
+        if missing == 0:
+            return wav
+        return F.pad(wav, (0, missing))
+    def encode(self, wav: torch.Tensor, sr: int = None):
+        """Return the encoder's ``mean`` output for *wav*; ``sr`` is accepted but not used here."""
+        if wav.ndim == 2:
+            # Insert a channel axis for 2-D input.
+            wav = wav.unsqueeze(1)
+        padded = self._pad_audio(wav)
+        return self.encoder(padded)["mean"]
+    def decode(self, latents: torch.Tensor):
+        """Run the decoder on *latents*."""
+        return self.decoder(latents)
+    def forward(self, x, sr=None):
+        """Encode *x* and decode the result."""
+        return self.decode(self.encode(x, sr))