In this tutorial, you'll learn how to build a real-time speech app using the Livestack framework. The app records speech, transcribes it into text, translates the text into another language (French, in this example), and periodically generates a one-line summarized title.
Before you begin, make sure you have the following:
- Node.js (with npm): the scaffolding command below runs via npx.
- An OpenAI API key: the transcription, summarization, and translation steps in this example are configured with whisperType: "openai" and llmType: "openai".
If you prefer to understand each step in detail, follow the instructions below to create the example app from scratch.
First, create a new Livestack project. The command below generates the project scaffold and installs the necessary dependencies.
npx create-livestack my-livestack-app --template typescript-speech-app
cd my-livestack-app
We'll start by setting up the client side of our application.
Create a file for shared constants.
src/common/defs.ts:
export const SPEECH_LIVEFLOW_NAME = "<your_speech_liveflow_name>";
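Any unique string works here; both the client and the server import this constant, so the job binding stays in sync. For example (the name itself is arbitrary):
export const SPEECH_LIVEFLOW_NAME = "my-live-speech";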
SpeechComponents.tsx
This component handles recording and displays the live transcription, translation, and summarized title.
src/client/SpeechComponents.tsx:
"use client";
import React from "react";
import {
usePCMRecorder,
encodeToB64,
rawPCMInput,
speechChunkToTextOutput,
} from "@livestack/transcribe/client";
import { useJobBinding, useOutput, useInput } from "@livestack/client";
import { SPEECH_LIVEFLOW_NAME } from "../common/defs";
import { translationOutputSchema } from "@livestack/lab-internal-common";
import { FaStop, FaMicrophone } from "react-icons/fa";
import { z } from "zod";
import prettyBytes from "pretty-bytes";
export const SpeechComponents: React.FC = () => {
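// Bind this component to the liveflow job running on the server.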
const job = useJobBinding({
specName: SPEECH_LIVEFLOW_NAME,
});
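// feed() pushes base64-encoded raw PCM chunks into the liveflow's "input-default" port.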
const { feed } = useInput({
tag: "input-default",
def: rawPCMInput,
job,
});
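// Subscribe to the last 10 translated segments.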
const translation = useOutput({
tag: "translation",
def: translationOutputSchema,
job,
query: { type: "lastN", n: 10 },
});
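// Subscribe to the most recent summarized title.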
const { last: summarizedTitle } = useOutput({
tag: "summarized-title",
def: z.object({
summarizedTitle: z.string(),
}),
job,
});
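// Subscribe to the last 10 transcript segments.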
const transcription = useOutput({
tag: "transcription",
def: speechChunkToTextOutput,
job,
query: { type: "lastN", n: 10 },
});
const [volume, setVolume] = React.useState<number>(0);
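// Record microphone audio as raw PCM; each chunk is encoded and fed to the server.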
const { startRecording, stopRecording, isRecording, cumulativeDataSent } =
usePCMRecorder({
onDataAvailable: async (data) => {
const encoded = encodeToB64(data);
if (feed) {
await feed({ rawPCM64Str: encoded });
console.log(encoded.slice(0, 10), "length: ", encoded.length);
}
},
onVolumeChange: (volume) => {
setVolume(volume);
},
});
const handleRecording = isRecording ? stopRecording : startRecording;
return (
<div className="m-4 grid grid-cols-5 gap-2 divide-x">
<div>
<h2 className="text-red-800">1. Click on "Start Recording" button</h2>
<br />
{job.jobId && (
<button
className="btn w-fit rounded border border-gray-800 bg-gray-200 p-2"
onClick={handleRecording}
>
<span style={{ display: "inline-block" }}>
{isRecording ? "Stop Recording" : "Start Recording"}
</span>
<span style={{ display: "inline-block" }}>
{isRecording ? <FaStop /> : <FaMicrophone />}
</span>
</button>
)}
<div>
Volume: <span>{volume.toFixed(1)}</span>
<br />
<progress
value={volume}
max={100}
style={{ width: "100px" }}
></progress>
<br />
{typeof cumulativeDataSent !== "undefined" && (
<>Total data sent: {prettyBytes(cumulativeDataSent)}</>
)}
</div>
</div>
<div className="col-span-2">
<div className="ml-4">
<h2 className="text-green-800">
2. Speech transcripts will pop up here
</h2>
<br />
<article style={{ maxWidth: "100%" }}>
{transcription.map((transcript, i) => (
<span key={i} className="text-sm">
{transcript.data.transcript}
</span>
))}
</article>
</div>
</div>
<div className="col-span-2">
<div className="ml-4">
<h2 className="text-blue-800">
3. Periodically, a one-liner short summary is generated
</h2>
<br />
<p>{summarizedTitle?.data.summarizedTitle}</p>
<br />
{translation && (
<div>
<h2 className="text-indigo-800">
4. Your speech translated to French
</h2>
<br />
{translation.map((t, idx) => (
<div key={idx}>{t.data.translated}</div>
))}
</div>
)}
</div>
</div>
</div>
);
};
export default SpeechComponents;
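Note that the tag values used above ("input-default", "transcription", "summarized-title", "translation") must match the exposure names defined in the server-side liveflow, which we set up next.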
index.tsx
Create the main entry point for the React application.
src/client/index.tsx:
import React, { Suspense } from "react";
import ReactDOM from "react-dom/client";
import SpeechComponents from "./SpeechComponents";
import "./globals.css";
const root = ReactDOM.createRoot(
document.getElementById("root") as HTMLElement
);
root.render(
<Suspense fallback={<div>Loading...</div>}>
<SpeechComponents />
</Suspense>
);
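This assumes the project's index.html contains an element with id "root" for React to mount into; the template includes one, but add it yourself if you are assembling the project by hand.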
Next, we will set up the server that will handle the speech processing.
Define the workflow for processing speech input in liveflow.speech.ts.
src/server/liveflow.speech.ts:
import {
rawPCMToWavSpec,
speechChunkToTextSpec,
} from "@livestack/transcribe/server";
import { Liveflow, conn, expose } from "@livestack/core";
import { SPEECH_LIVEFLOW_NAME } from "../common/defs";
import { translationSpec } from "@livestack/translate-server";
import { titleSummarizerSepc } from "@livestack/summarizer/server";
import { textSplittingSpec } from "@livestack/lab-internal-server";
export const speechLiveflow = Liveflow.define({
name: SPEECH_LIVEFLOW_NAME,
connections: [
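// Convert raw PCM chunks to WAV, then transcribe them with Whisper (via OpenAI).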
conn({
from: rawPCMToWavSpec,
transform: ({ wavb64Str }) => ({ wavb64Str, whisperType: "openai" }),
to: speechChunkToTextSpec,
}),
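// Pipe transcripts into the text splitter, which chunks them for summarization.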
conn({
from: speechChunkToTextSpec,
transform: ({ transcript }) => transcript,
to: textSplittingSpec,
}),
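// Summarize each accumulated text chunk into a one-line title.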
conn({
from: textSplittingSpec,
transform: (chunkText) => ({ transcript: chunkText, llmType: "openai" }),
to: titleSummarizerSepc,
}),
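// Independently, translate each transcript into French.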
conn({
from: speechChunkToTextSpec,
transform: ({ transcript }) => ({
toLang: "French",
text: transcript,
llmType: "openai",
}),
to: translationSpec,
}),
],
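// Expose ports under the tags that the client-side useInput/useOutput hooks reference.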
exposures: [
expose(rawPCMToWavSpec.input.default, "input-default"),
expose(speechChunkToTextSpec.output.default, "transcription"),
expose(titleSummarizerSepc.output.default, "summarized-title"),
expose(translationSpec.output.default, "translation"),
],
});
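Each conn() pipes one spec's output into the next spec's input, with transform adapting the payload shape between them, and each expose() publishes a spec's port under the tag that the client hooks reference. To translate into a different language, you would presumably only need to change toLang in the translation connection; a hypothetical variant:
conn({
  from: speechChunkToTextSpec,
  transform: ({ transcript }) => ({
    toLang: "Spanish", // assumption: the translation spec accepts other language names as plain strings
    text: transcript,
    llmType: "openai",
  }),
  to: translationSpec,
}),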
index.ts
Set up the main server file to initialize the environment and start the server.
src/server/index.ts:
import { LiveEnv } from "@livestack/core";
import { getLocalTempFileStorageProvider } from "@livestack/core";
import { initJobBinding } from "@livestack/gateway";
import express from "express";
import path from "path";
import bodyParser from "body-parser";
import cors from "cors";
import ViteExpress from "vite-express";
import { speechLiveflow } from "./liveflow.speech";
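// Create the Livestack environment; job data is stored via the local temp file provider under /tmp/zzlive.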
const liveEnvP = LiveEnv.create({
projectId: "MY_LIVE_SPEECH_APP",
storageProvider: getLocalTempFileStorageProvider("/tmp/zzlive"),
});
async function main() {
LiveEnv.setGlobal(liveEnvP);
const app = express();
app.use(cors());
app.use(bodyParser.json());
app.use(express.static(path.join(__dirname, "..", "public")));
const PORT = 4700;
const httpServer = ViteExpress.listen(app, PORT, () => {
console.info(`Server running on http://localhost:${PORT}.`);
});
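// Attach the Livestack gateway to the same HTTP server so that client-side hooks can bind to the liveflow.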
initJobBinding({
httpServer,
allowedSpecsForBinding: [speechLiveflow],
});
}
main();
Ensure your project structure matches this:
my-livestack-app/
├── node_modules/
├── public/ (optional)
├── src/
│ ├── client/
│ │ ├── index.tsx
│ │ ├── SpeechComponents.tsx
│ │ └── globals.css
│ ├── server/
│ │ ├── index.ts
│ │ └── liveflow.speech.ts
│ └── common/
│ └── defs.ts
├── package.json
├── tsconfig.json
├── index.html
├── postcss.config.cjs
├── tailwind.config.ts
├── vite.config.ts
├── .gitignore
└── .env (optional)
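If you keep the OpenAI-backed specs configured above (whisperType: "openai", llmType: "openai"), the server process will also need your OpenAI credentials; an OPENAI_API_KEY entry in the optional .env file is the usual place for them (the exact variable name is an assumption, so check the Livestack docs for your version). Finally, start the development server: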
npm run dev
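Then open http://localhost:4700 in your browser, click "Start Recording", and speak: transcripts, the French translation, and a periodically refreshed title will appear in the panels.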
By following this tutorial, you've built a real-time speech transcription, translation, and summarization app using the Livestack framework. The app records speech, transcribes it, translates it into French, and periodically generates a summarized title. For a quick start, use the npx create-livestack <project-directory> --template [typescript-speech-app / typescript-backend-only / typescript-setup-only] command; to gain a deeper understanding of the process, follow the detailed steps above.