Unity Integration for Stable Diffusion & ChatGPT for AI Image + Speech Generation

There are other tools out there that allow the integration of Stable Diffusion, ChatGPT, and Text to Speech (TTS) into Unity, but this guide is intended to make integrating them all very simple. The below script doesn’t have the best coding practices, but it is an easy place to start with Unity AI integrations.

Start By Getting All the Service Keys

You of course will have to get your own keys from each service:

Get the Unity AI Manager Script

At the bottom I have listed the full script out so you can peruse it. Or you can just download it here:

AIManager.cs_Download

How to use the AI Manager Unity Script

Once you’ve dropped the script into the Unity assets folder, it is good to understand what a UnityEvent is. TL;DR: It holds a list of functions (listeners) that will all be called whenever the event happens, or is ‘invoked.’ Let’s look at how that works in the test case in the script.

You first need a UnityEvent for the given type of response you are expecting from the AIs. If we take the AudioClip as an example, we give the GetVoice call a string that describes the phrase we want spoken, and a UnityEvent<AudioClip> that will be called when the clip is ready.

But the UnityEvent<AudioClip> doesn’t do anything on its own. You won’t get an error, but you also won’t get a voice saying something. We need to say what is done with the AudioClip. Here we add the function testAudioClip as a listener. Once we’ve added the function, every time the UnityEvent audioClipCallback is invoked, our function testAudioClip will fire. That function will find an AudioSource if there is one, and if not it will create one. Then it will set the clip for the AudioSource and play it.

I recommend making your own UnityEvents and then feeding those events from each script into the AIManager.Instance.GetSomething(input, callbackEvent).

The Outputs

The Stable Diffusion outputs from the Runpod.io API can be good, but they won’t be as good as ones you edit and refine in the automatic1111 UI, which you can use on Runpod.io servers. Here are some example images from the runpod.io API Unity AI script.

runpod.io generated image using the unity ai script

The Unity AI Manager Script

using System;
using System.Collections;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using UnityEngine;
using UnityEngine.Networking;
using UnityEngine.UI;
using UnityEngine.Events;
using System.Text;


#region Classes
/// <summary>
/// Target type that GetTextCoroutine hands to JsonUtility when it parses the
/// body returned by the chat completions request. Field names must stay equal
/// to the JSON keys. Only <see cref="choices"/> is read elsewhere in this file.
/// </summary>
[Serializable]
public class ChatResponse
{
    /// <summary>Value of the "id" key in the response.</summary>
    public string id;

    /// <summary>Value of the "model" key in the response.</summary>
    public string model;

    /// <summary>Entries of the "choices" array; the first one carries the reply text.</summary>
    public ChatOption[] choices;
}


/// <summary>
/// One element of <c>ChatResponse.choices</c>. Field names must stay equal to
/// the JSON keys; this file only reads <see cref="message"/>.
/// </summary>
[Serializable]
public class ChatOption
{
    /// <summary>The message object whose content is passed to the text callback.</summary>
    public ChatMessage message;

    /// <summary>Value of the "index" key for this choice.</summary>
    public int index;

    /// <summary>Value of the "finish_reason" key for this choice.</summary>
    public string finish_reason;
}
/// <summary>
/// A single chat message as it appears inside a choice of the chat response.
/// Field names must stay equal to the JSON keys.
/// </summary>
[Serializable]
public class ChatMessage
{
    /// <summary>Value of the "role" key (the request in this file sends "user").</summary>
    public string role;

    /// <summary>Message text; GetTextCoroutine forwards this to its callback.</summary>
    public string content;
}




/// <summary>
/// Target type for both the image "run" response and the follow-up "status"
/// responses parsed with JsonUtility in the image coroutine. Field names must
/// stay equal to the JSON keys.
/// </summary>
[Serializable]
public class ImageStatus
{
    /// <summary>Job id; appended to the status URL when polling.</summary>
    public string id;

    /// <summary>Value of the "status" key reported for the job.</summary>
    public string status;

    /// <summary>Generated images; the image coroutine keeps polling while this is null.</summary>
    public ImageResult[] output;
}


/// <summary>
/// One element of <c>ImageStatus.output</c>. Field names must stay equal to
/// the JSON keys.
/// </summary>
[Serializable]
public class ImageResult
{
    /// <summary>Value of the "seed" key for this image.</summary>
    public int seed;

    /// <summary>Address the image coroutine downloads the texture from.</summary>
    public string image;
}


/// <summary>
/// Voices that can be requested for speech synthesis. Each member is a key of
/// the <c>voices</c> table in AIManager, which maps it to an SSML voice tag.
/// The numeric values are spelled out so that inserting or reordering members
/// cannot silently change values that were already stored.
/// </summary>
public enum AudioVoice
{
    MollyNeural = 0,
    AnnetteNeural = 1,
    CarlyNeural = 2,
    DarrenNeural = 3,
    DuncanNeural = 4,
    ElsieNeural = 5,
    FreyaNeural = 6,
    JoanneNeural = 7,
    KenNeural = 8,
    KimNeural = 9,
    NatashaNeural = 10,
    NeilNeural = 11,
    TimNeural = 12,
    TinaNeural = 13,
    WilliamNeural = 14
}


/// <summary>
/// Optional tuning for a speech request. The voice coroutine in AIManager
/// creates a default instance when the caller passes none.
/// </summary>
[Serializable]
public class VoiceParams
{
    /// <summary>Text placed in the <c>rate</c> attribute of the SSML prosody element.</summary>
    public string speed = "20%";

    /// <summary>Voice whose SSML tag is looked up for the request.</summary>
    public AudioVoice voice = AudioVoice.MollyNeural;

    /// <summary>Text placed in the <c>pitch</c> attribute of the SSML prosody element.</summary>
    public string pitch = "-5%";
}
#endregion


public class AIManager : MonoBehaviour
{
    // Globally reachable manager; assigned in Awake to whichever AIManager woke up last.
    public static AIManager Instance;

    // When true, Start() registers the test* listeners and fires one request of each kind.
    public bool test = false;
    // When true, GetText requests the "gpt-4" model instead of "gpt-3.5-turbo".
    public bool gpt4 = false;

    // Events used by the built-in test run in Start(); each one is passed as the
    // callback of the matching Get* call after a test* listener has been attached.
    public UnityEvent<string> textCallback;
    public UnityEvent<Sprite> spriteCallback;
    public UnityEvent<Texture> textureCallback;
    public UnityEvent<AudioClip> audioClipCallback;
    // Receives the raw bytes of the speech response (passed as callbackWav to GetVoice).
    public UnityEvent<byte[]> wavCallback;

    /// <summary>
    /// Publishes this component through <see cref="Instance"/> as soon as it wakes up.
    /// </summary>
    private void Awake() => Instance = this;

    /// <summary>
    /// Built-in smoke test. Does nothing unless <see cref="test"/> is ticked; otherwise
    /// it attaches the test* handlers to the public events and fires one text, two
    /// image (sprite and texture) and one voice request with a fixed prompt.
    /// </summary>
    private void Start()
    {
        if (!test)
        {
            return;
        }

        const string prompt = "This is a test.";

        // Attach the handlers before any request is started.
        textCallback.AddListener(testText);
        spriteCallback.AddListener(testSprite);
        textureCallback.AddListener(testTexture);
        audioClipCallback.AddListener(testAudioClip);
        wavCallback.AddListener(testWav);

        // One request per supported output type.
        GetText(prompt, textCallback);
        GetImage(prompt, spriteCallback);
        GetImage(prompt, textureCallback);
        GetVoice(prompt, audioClipCallback, wavCallback);
    }
    /// <summary>Test listener: logs the text that came back from the chat request.</summary>
    private void testText(string gptResponse) =>
        Debug.Log("We got the response from GPT: " + gptResponse);
    /// <summary>Test listener: logs that a sprite arrived (the sprite itself is not used).</summary>
    private void testSprite(Sprite sprite) =>
        Debug.Log("We got the sprite back");
    /// <summary>Test listener: logs that a texture arrived (the texture itself is not used).</summary>
    private void testTexture(Texture texture) =>
        Debug.Log("We got the texture back");
    /// <summary>
    /// Test listener: plays the received clip on this GameObject, reusing an
    /// existing AudioSource or adding one when none is attached yet.
    /// </summary>
    private void testAudioClip(AudioClip clip)
    {
        Debug.Log("We got the clip back");

        AudioSource player = gameObject.GetComponent<AudioSource>();
        if (!player)
        {
            // No AudioSource on this object yet: add one for playback.
            player = gameObject.AddComponent<AudioSource>();
        }

        player.clip = clip;
        player.Play();
    }
    /// <summary>Test listener: logs how many bytes the speech response contained.</summary>
    private void testWav(byte[] bytes)
    {
        int byteCount = bytes.Length;
        Debug.Log("We got the bytes back and it has a length of " + byteCount);
    }
#region Text
   /// <summary>
   /// Starts a chat request for <paramref name="input"/>; the reply text is
   /// delivered later through <paramref name="callback"/>.
   /// </summary>
   /// <param name="input">Prompt sent as the user message.</param>
   /// <param name="callback">Event invoked with the reply text.</param>
   public void GetText(string input, UnityEvent<string> callback)
   {
       IEnumerator routine = GetTextCoroutine(input, callback);
       StartCoroutine(routine);
   }
   /// <summary>
   /// Posts <paramref name="input"/> as a single user message to the chat
   /// completions endpoint and invokes <paramref name="callback"/> with the
   /// content of the first returned choice.
   /// </summary>
   /// <param name="input">Prompt text; escaped with cleanForJSON before being embedded in the body.</param>
   /// <param name="callback">Event invoked with the reply text. May be null, in which case the reply is dropped.</param>
   /// <remarks>
   /// Transport errors and responses without a usable first choice are logged
   /// with Debug.LogError and the callback is not invoked.
   /// </remarks>
   private IEnumerator GetTextCoroutine(string input, UnityEvent<string> callback)
   {
       string submitText = cleanForJSON(input);
       string url = "https://api.openai.com/v1/chat/completions";
       string model = gpt4 ? "gpt-4" : "gpt-3.5-turbo";
       string secret = ""; //TODO put your secret here!  Gpt4 is only if you have access, also it is MUCH more expensive and 5x slower.
       string json = "{" +
           "\"messages\": [{\"role\": \"user\", \"content\": \""+submitText+"\"}]," +
           "\"temperature\": 0.7," +
           "\"max_tokens\": 2000," +
           "\"top_p\": 1," +
           "\"frequency_penalty\": 0.3," +
           "\"presence_penalty\": 0.3," +
           "\"model\": \""+model+"\"" +
           "}";
       Debug.Log(json);
       using(UnityWebRequest request = new UnityWebRequest(url))
       {
           request.uploadHandler = new UploadHandlerRaw(System.Text.Encoding.UTF8.GetBytes(json));
           request.downloadHandler = new DownloadHandlerBuffer();
           request.method = UnityWebRequest.kHttpVerbPOST;
           request.SetRequestHeader("Content-Type", "application/json");
           request.SetRequestHeader("Authorization", "Bearer " + secret);
           request.disposeUploadHandlerOnDispose = true;
           request.disposeDownloadHandlerOnDispose = true;
           yield return request.SendWebRequest();

           if (request.result != UnityWebRequest.Result.Success)
           {
               Debug.LogError(request.error);
               yield break;
           }

           string body = request.downloadHandler.text;
           ChatResponse chatResponse = JsonUtility.FromJson<ChatResponse>(body);

           // A successful HTTP status does not guarantee a usable reply: guard
           // every step on the way to choices[0].message.content.
           ChatOption[] choices = chatResponse != null ? chatResponse.choices : null;
           if (choices == null || choices.Length == 0 || choices[0] == null || choices[0].message == null)
           {
               Debug.LogError("GetText: response contained no choices: " + body);
               yield break;
           }

           if (callback != null)
           {
               callback.Invoke(choices[0].message.content);
           }
       }
   }
#endregion
#region Images
   /// <summary>
   /// Starts an image request for <paramref name="input"/> whose result is
   /// delivered as a Sprite through <paramref name="callback"/>.
   /// </summary>
   public void GetImage(string input, UnityEvent<Sprite> callback)
   {
       IEnumerator routine = GetImage(input, callbackSprite: callback, callbackTexture: null);
       StartCoroutine(routine);
   }
   /// <summary>
   /// Starts an image request for <paramref name="input"/> whose result is
   /// delivered as a Texture through <paramref name="callback"/>.
   /// </summary>
   public void GetImage(string input, UnityEvent<Texture> callback)
   {
       IEnumerator routine = GetImage(input, callbackSprite: null, callbackTexture: callback);
       StartCoroutine(routine);
   }
   /// <summary>
   /// Submits a Stable Diffusion job for <paramref name="input"/>, polls the
   /// job's status URL until an output is reported, downloads the first image
   /// and hands it to whichever callbacks are non-null.
   /// </summary>
   /// <param name="input">Prompt text; escaped with cleanForJSON before being embedded in the body.</param>
   /// <param name="callbackSprite">Invoked with a Sprite built from the downloaded texture, or null to skip.</param>
   /// <param name="callbackTexture">Invoked with the downloaded texture, or null to skip.</param>
   /// <remarks>
   /// The coroutine stops (after logging an error) when the submit request
   /// fails, when no job id is returned, when the job reports a failure
   /// status, or when the polling budget below is used up. No callback is
   /// invoked in those cases.
   /// </remarks>
   private IEnumerator GetImage(string input, UnityEvent<Sprite> callbackSprite, UnityEvent<Texture> callbackTexture)
   {
       // Polling budget: 150 polls, 2 seconds apart, is roughly five minutes.
       const int maxPollAttempts = 150;
       const float pollIntervalSeconds = 2f;

       input = cleanForJSON(input);
       int seed = Mathf.FloorToInt(UnityEngine.Random.value * 10000);
       string url = "https://api.runpod.ai/v1/stable-diffusion-v1/run";
       string key = ""; //TODO put your Runpod.io key here!  This comes out to ~$0.0001 per image
       string json = "{" +
           "\"input\": {\"prompt\": \"" +
           input + "\"" +
           (", \"seed\": " + seed) +
           ", \"negative_prompt\": \"big boobs\"" + //this corrects for most female oversexualization
           ", \"num_outputs\": 1" +
           ", \"num_inference_steps\": 20" +
           "}}";

       // Step 1: submit the job. Nothing can be polled without a job id, so
       // any failure here ends the coroutine.
       ImageStatus response = null;
       using(UnityWebRequest request = new UnityWebRequest(url))
       {
           request.uploadHandler = new UploadHandlerRaw(System.Text.Encoding.UTF8.GetBytes(json));
           request.downloadHandler = new DownloadHandlerBuffer();
           request.method = UnityWebRequest.kHttpVerbPOST;
           request.SetRequestHeader("Content-Type", "application/json");
           request.SetRequestHeader("Authorization", "Bearer " + key);
           request.disposeUploadHandlerOnDispose = true;
           request.disposeDownloadHandlerOnDispose = true;
           yield return request.SendWebRequest();

           if (request.result != UnityWebRequest.Result.Success)
           {
               Debug.LogError(input + ": " + request.error);
               yield break;
           }
           response = JsonUtility.FromJson<ImageStatus>(request.downloadHandler.text);
       }

       if (response == null || string.IsNullOrEmpty(response.id))
       {
           Debug.LogError(input + ": image job was accepted but no job id was returned");
           yield break;
       }

       // Step 2: poll the status URL until the job reports an output.
       url = "https://api.runpod.ai/v1/stable-diffusion-v1/status/" + response.id;
       for (int attempt = 0; response.output == null || response.output.Length == 0; attempt++)
       {
           // Terminal failure states per the RunPod job-status documentation;
           // waiting longer would never produce an output.
           if (response.status == "FAILED" || response.status == "CANCELLED" || response.status == "TIMED_OUT")
           {
               Debug.LogError(input + ": image job ended with status " + response.status);
               yield break;
           }
           if (attempt >= maxPollAttempts)
           {
               Debug.LogError(input + ": gave up waiting for the image job after " + maxPollAttempts + " status checks");
               yield break;
           }

           yield return new WaitForSeconds(pollIntervalSeconds);
           using(UnityWebRequest request2 = new UnityWebRequest(url))
           {
               request2.uploadHandler = new UploadHandlerRaw(System.Text.Encoding.UTF8.GetBytes(json));
               request2.downloadHandler = new DownloadHandlerBuffer();
               request2.method = UnityWebRequest.kHttpVerbPOST;
               request2.SetRequestHeader("Content-Type", "application/json");
               request2.SetRequestHeader("Authorization", "Bearer " + key);
               request2.disposeUploadHandlerOnDispose = true;
               request2.disposeDownloadHandlerOnDispose = true;
               yield return request2.SendWebRequest();
               if (request2.result != UnityWebRequest.Result.Success)
               {
                   // A single failed poll is not fatal; the attempt budget bounds the retries.
                   Debug.LogError(input + ": " + request2.error);
               }
               else
               {
                   ImageStatus polled = JsonUtility.FromJson<ImageStatus>(request2.downloadHandler.text);
                   if (polled != null)
                   {
                       response = polled;
                   }
               }
           }
       }

       // Step 3: download the first generated image.
       ImageResult first = response.output[0];
       if (first == null || string.IsNullOrEmpty(first.image))
       {
           Debug.LogError(input + ": image job finished without an image address");
           yield break;
       }

       using(UnityWebRequest wwwTex = UnityWebRequestTexture.GetTexture(first.image))
       {
           yield return wwwTex.SendWebRequest();

           if (wwwTex.result != UnityWebRequest.Result.Success)
           {
               Debug.LogError(input + ": " + wwwTex.error);
               yield break;
           }

           Texture2D myTexture = ((DownloadHandlerTexture) wwwTex.downloadHandler).texture;
           if(callbackSprite != null)
           {
               callbackSprite.Invoke(Sprite.Create(myTexture, new Rect(0.0f, 0.0f, myTexture.width, myTexture.height), new Vector2(0.5f, 0.5f)));
           }
           if(callbackTexture != null)
           {
               callbackTexture.Invoke(myTexture);
           }
       }
   }
   #endregion
#region Voice
   /// <summary>Address that FetchToken posts the subscription key to in order to obtain a token.</summary>
   public static readonly string FetchTokenUri = "https://eastus.api.cognitive.microsoft.com/sts/v1.0/issuetoken";

   // Token sent as the Bearer value of speech requests; empty until FetchToken has stored one.
   private string token = "";

   // Opening SSML <voice> tag for every AudioVoice member. The speech request
   // appends the prosody element and the closing tags itself.
   private readonly Dictionary<AudioVoice, string> voices = new Dictionary<AudioVoice, string>
   {
       [AudioVoice.MollyNeural] = "<voice xml:lang='en-NZ' xml:gender='Female' name='en-NZ-MollyNeural'>",
       [AudioVoice.AnnetteNeural] = "<voice xml:lang='en-AU' xml:gender='Female' name='en-AU-AnnetteNeural'>",
       [AudioVoice.CarlyNeural] = "<voice xml:lang='en-AU' xml:gender='Female' name='en-AU-CarlyNeural'>",
       [AudioVoice.ElsieNeural] = "<voice xml:lang='en-AU' xml:gender='Female' name='en-AU-ElsieNeural'>",
       [AudioVoice.FreyaNeural] = "<voice xml:lang='en-AU' xml:gender='Female' name='en-AU-FreyaNeural'>",
       [AudioVoice.JoanneNeural] = "<voice xml:lang='en-AU' xml:gender='Female' name='en-AU-JoanneNeural'>",
       [AudioVoice.KimNeural] = "<voice xml:lang='en-AU' xml:gender='Female' name='en-AU-KimNeural'>",
       [AudioVoice.NatashaNeural] = "<voice xml:lang='en-AU' xml:gender='Female' name='en-AU-NatashaNeural'>",
       [AudioVoice.TinaNeural] = "<voice xml:lang='en-AU' xml:gender='Female' name='en-AU-TinaNeural'>",
       [AudioVoice.DarrenNeural] = "<voice xml:lang='en-AU' xml:gender='Male' name='en-AU-DarrenNeural'>",
       [AudioVoice.DuncanNeural] = "<voice xml:lang='en-AU' xml:gender='Male' name='en-AU-DuncanNeural'>",
       [AudioVoice.KenNeural] = "<voice xml:lang='en-AU' xml:gender='Male' name='en-AU-KenNeural'>",
       [AudioVoice.NeilNeural] = "<voice xml:lang='en-AU' xml:gender='Male' name='en-AU-NeilNeural'>",
       [AudioVoice.TimNeural] = "<voice xml:lang='en-AU' xml:gender='Male' name='en-AU-TimNeural'>",
       [AudioVoice.WilliamNeural] = "<voice xml:lang='en-AU' xml:gender='Male' name='en-AU-WilliamNeural'>",
   };


   /// <summary>
   /// Starts the token fetch against <see cref="FetchTokenUri"/>; the speech
   /// request described by the remaining arguments is issued once it succeeds.
   /// </summary>
   /// <param name="subscriptionKey">Key sent in the subscription header of the token request.</param>
   /// <param name="input">Text to be spoken.</param>
   /// <param name="callback">Event that receives the decoded AudioClip.</param>
   /// <param name="callbackWav">Event that receives the raw response bytes.</param>
   /// <param name="voiceParams">Voice settings forwarded to the speech request.</param>
   public void Authentication(string subscriptionKey, string input, UnityEvent<AudioClip> callback, UnityEvent<byte[]> callbackWav, VoiceParams voiceParams)
   {
       IEnumerator tokenRoutine = FetchToken(FetchTokenUri, subscriptionKey, input, callback, callbackWav, voiceParams);
       StartCoroutine(tokenRoutine);
   }


   /// <summary>
   /// Posts the subscription key to <paramref name="url"/>, stores the
   /// returned text in <c>token</c> and then starts the speech request.
   /// </summary>
   /// <param name="url">Token endpoint.</param>
   /// <param name="subscriptionKey">Value of the Ocp-Apim-Subscription-Key header.</param>
   /// <param name="input">Text to be spoken once the token is available.</param>
   /// <param name="callback">Forwarded to the speech request.</param>
   /// <param name="callbackWav">Forwarded to the speech request.</param>
   /// <param name="voiceParams">Forwarded to the speech request.</param>
   /// <remarks>
   /// On a transport error or an empty token the error is logged and no
   /// speech request is started. The request is disposed on every path.
   /// </remarks>
   private IEnumerator FetchToken(string url, string subscriptionKey, string input, UnityEvent<AudioClip> callback, UnityEvent<byte[]> callbackWav, VoiceParams voiceParams)
   {
       using(UnityWebRequest request = new UnityWebRequest(url))
       {
           request.SetRequestHeader("Ocp-Apim-Subscription-Key", subscriptionKey);
           request.SetRequestHeader("Content-type", "application/x-www-form-urlencoded");
           request.uploadHandler = new UploadHandlerRaw(new byte[0]);
           request.downloadHandler = new DownloadHandlerBuffer();
           request.method = UnityWebRequest.kHttpVerbPOST;
           yield return request.SendWebRequest();

           if (request.result != UnityWebRequest.Result.Success)
           {
               Debug.LogError("FetchToken: " + request.error);
               yield break;
           }

           // Copy the text out before the request (and its download handler) is disposed.
           string fetched = request.downloadHandler.text;
           if (string.IsNullOrEmpty(fetched))
           {
               Debug.LogError("FetchToken: the token endpoint returned an empty body");
               yield break;
           }
           token = fetched;
       }

       StartCoroutine(GetVoiceCoroutine(input, callback, callbackWav, voiceParams));
   }
   /// <summary>
   /// Requests speech for <paramref name="input"/>. When a token is already
   /// stored the speech request starts right away; otherwise a token is
   /// fetched first and the speech request follows it.
   /// </summary>
   /// <param name="input">Text to be spoken.</param>
   /// <param name="callback">Event that receives the decoded AudioClip.</param>
   /// <param name="callbackWav">Event that receives the raw response bytes.</param>
   /// <param name="voiceParams">Optional voice settings; defaults are used when null.</param>
   public void GetVoice(string input, UnityEvent<AudioClip> callback, UnityEvent<byte[]> callbackWav, VoiceParams voiceParams = null)
   {
       if(token != "")
       {
           StartCoroutine(GetVoiceCoroutine(input, callback, callbackWav, voiceParams));
           return;
       }

       string key = ""; //TODO: put your Microsoft Azure key here, good bit free per month
       Authentication(key, input, callback, callbackWav, voiceParams);
   }
   /// <summary>
   /// Sends <paramref name="input"/> to the text-to-speech endpoint as SSML and
   /// delivers the result: the raw response bytes through
   /// <paramref name="callbackWav"/> and a decoded mono 24 kHz AudioClip
   /// through <paramref name="callback"/>.
   /// </summary>
   /// <param name="input">Text to speak. "&amp;", "!", "&lt;" and "&gt;" are replaced with speakable text so the SSML stays well-formed.</param>
   /// <param name="callback">Receives the decoded clip, or null to skip decoding.</param>
   /// <param name="callbackWav">Receives the unmodified response bytes (RIFF header included), or null to skip.</param>
   /// <param name="voiceParams">Voice, rate and pitch; a default <see cref="VoiceParams"/> is used when null.</param>
   /// <remarks>
   /// Request shape:
   ///   POST /cognitiveservices/v1
   ///   X-Microsoft-OutputFormat: riff-24khz-16bit-mono-pcm
   ///   Content-Type: application/ssml+xml
   ///   Authorization: Bearer [access token from the issueToken endpoint]
   ///   User-Agent: [application name]
   ///   body: &lt;speak&gt;&lt;voice ...&gt;&lt;prosody ...&gt;text&lt;/prosody&gt;&lt;/voice&gt;&lt;/speak&gt;
   /// Errors are logged and no callback is invoked. On HTTP 401 the stored
   /// token is cleared so that the next GetVoice call fetches a fresh one.
   /// </remarks>
   private IEnumerator GetVoiceCoroutine(string input, UnityEvent<AudioClip> callback, UnityEvent<byte[]> callbackWav, VoiceParams voiceParams = null)
   {
       if(voiceParams == null)
       {
           voiceParams = new VoiceParams();
       }

       // Characters that would break the SSML markup are replaced by words.
       string submitText = (input ?? "").Replace("&", "and").Replace("!", ".").Replace("<", "less than").Replace(">", "greater than");
       string url = "https://eastus.tts.speech.microsoft.com/cognitiveservices/v1";

       // Fall back to the default voice if the enum value has no table entry
       // (for example a value produced by an integer cast).
       string voiceTag;
       if (!voices.TryGetValue(voiceParams.voice, out voiceTag))
       {
           voiceTag = voices[AudioVoice.MollyNeural];
       }

       string xml = "<speak version='1.0' xml:lang='en-NZ'>"+
           voiceTag +
           "<prosody rate='" +
           voiceParams.speed +
           "' pitch='" +
           voiceParams.pitch +
           "'>"+
           submitText+
           "</prosody></voice></speak>";

       using(UnityWebRequest request = new UnityWebRequest(url))
       {
           request.uploadHandler = new UploadHandlerRaw(System.Text.Encoding.UTF8.GetBytes(xml));
           request.downloadHandler = new DownloadHandlerBuffer();
           request.method = UnityWebRequest.kHttpVerbPOST;
           request.SetRequestHeader("X-Microsoft-OutputFormat", "riff-24khz-16bit-mono-pcm");
           request.SetRequestHeader("Content-Type", "application/ssml+xml");
           request.SetRequestHeader("Authorization", "Bearer " + token);
           request.SetRequestHeader("User-Agent", "dgeisertTTS");
           request.disposeUploadHandlerOnDispose = true;
           request.disposeDownloadHandlerOnDispose = true;
           yield return request.SendWebRequest();

           if (request.result != UnityWebRequest.Result.Success)
           {
               if (request.responseCode == 401)
               {
                   // The stored token was rejected (tokens expire); drop it so
                   // the next GetVoice call authenticates again.
                   token = "";
               }
               Debug.LogError(request.error);
               yield break;
           }

           byte[] buffer = request.downloadHandler.data;
           if (buffer == null || buffer.Length == 0)
           {
               Debug.LogError("GetVoice: the speech service returned no audio data");
               yield break;
           }

           if(callbackWav != null)
           {
               callbackWav.Invoke(buffer);
           }
           if(callback == null)
           {
               yield break;
           }

           // Decode only the PCM payload: the RIFF header is not audio.
           int dataOffset;
           int dataLength;
           FindWavData(buffer, out dataOffset, out dataLength);

           int bytesPerSample = 2; // 16 bit mono
           int sampleCount = dataLength / bytesPerSample; // a trailing odd byte is ignored
           if (sampleCount == 0)
           {
               Debug.LogError("GetVoice: the speech response contained no PCM samples");
               yield break;
           }

           float[] unityData = new float[sampleCount];
           for (int i = 0; i < sampleCount; i++)
           {
               int pos = dataOffset + i * bytesPerSample;
               unityData[i] = BytesToFloat(buffer[pos], buffer[pos + 1]);
           }

           // The clip length is given in samples, not bytes.
           AudioClip clip = AudioClip.Create(name, sampleCount, 1, 24000, false);
           clip.SetData(unityData, 0);
           callback.Invoke(clip);
       }
   }

   /// <summary>
   /// Locates the PCM payload inside a RIFF/WAVE buffer by walking its chunks
   /// until the "data" chunk. A buffer without a RIFF/WAVE header is treated
   /// as raw PCM (offset 0, full length). A RIFF buffer without a "data"
   /// chunk yields a length of 0.
   /// </summary>
   private static void FindWavData(byte[] wav, out int offset, out int length)
   {
       offset = 0;
       length = wav.Length;
       if (wav.Length < 12 || !HasChunkTag(wav, 0, "RIFF") || !HasChunkTag(wav, 8, "WAVE"))
       {
           return;
       }

       int pos = 12;
       while (pos + 8 <= wav.Length)
       {
           // Chunk sizes are little-endian unsigned 32-bit values.
           long chunkSize = (uint)(wav[pos + 4] | (wav[pos + 5] << 8) | (wav[pos + 6] << 16) | (wav[pos + 7] << 24));
           int bodyStart = pos + 8;
           if (HasChunkTag(wav, pos, "data"))
           {
               int available = wav.Length - bodyStart;
               offset = bodyStart;
               // Streamed output may declare a size of 0 or one larger than
               // what actually arrived; use the bytes that are present.
               length = (chunkSize == 0 || chunkSize > available) ? available : (int)chunkSize;
               return;
           }

           // Chunks are padded to an even number of bytes.
           long next = bodyStart + chunkSize + (chunkSize & 1);
           if (next > wav.Length)
           {
               break;
           }
           pos = (int)next;
       }

       offset = wav.Length;
       length = 0;
   }

   /// <summary>Returns true when the four bytes at <paramref name="index"/> spell the ASCII <paramref name="tag"/>.</summary>
   private static bool HasChunkTag(byte[] bytes, int index, string tag)
   {
       for (int k = 0; k < 4; k++)
       {
           if (bytes[index + k] != (byte)tag[k])
           {
               return false;
           }
       }
       return true;
   }
#endregion
#region Util


   /// <summary>
   /// Combines two bytes (low byte first) into a signed 16-bit sample and
   /// scales it by 1/32768.
   /// </summary>
   /// <param name="firstByte">Low-order byte of the sample.</param>
   /// <param name="secondByte">High-order byte of the sample.</param>
   /// <returns>The sample divided by 32768.</returns>
   private static float BytesToFloat(byte firstByte, byte secondByte)
   {
       int combined = firstByte | (secondByte << 8);
       short sample = (short)combined;
       return sample / 32768.0F;
   }
   /// <summary>
   /// Escapes <paramref name="s"/> so it can be placed between double quotes
   /// inside a JSON document.
   /// </summary>
   /// <param name="s">Text to escape; may be null.</param>
   /// <returns>
   /// The escaped text, or an empty string for null or empty input. Backslash,
   /// double quote and forward slash are prefixed with a backslash; backspace,
   /// tab, newline, form feed and carriage return use their short escapes;
   /// every other character below U+0020 becomes a four-digit \uXXXX escape.
   /// </returns>
   public static string cleanForJSON(string s)
   {
       if (string.IsNullOrEmpty(s))
       {
           return "";
       }

       StringBuilder sb = new StringBuilder(s.Length + 4);
       foreach (char c in s)
       {
           switch (c)
           {
               case '\\':
               case '"':
               case '/':
                   sb.Append('\\').Append(c);
                   break;
               case '\b':
                   sb.Append("\\b");
                   break;
               case '\t':
                   sb.Append("\\t");
                   break;
               case '\n':
                   sb.Append("\\n");
                   break;
               case '\f':
                   sb.Append("\\f");
                   break;
               case '\r':
                   sb.Append("\\r");
                   break;
               default:
                   if (c < ' ')
                   {
                       // Format the code point itself as four hex digits.
                       // String.Format("X", c) has no placeholder and returned
                       // the literal "X", which produced the invalid "\u000X".
                       sb.Append("\\u").Append(((int)c).ToString("X4"));
                   }
                   else
                   {
                       sb.Append(c);
                   }
                   break;
           }
       }
       return sb.ToString();
   }
   #endregion
}

Other Guides on Stable Diffusion

How To Set Up ControlNet Models in Stable Diffusion

How to Train a Custom Embedding in Stable Diffusion Tutorial