From ab003afcc4b6406ccc5ac7776ce20ad71eff7ca4 Mon Sep 17 00:00:00 2001
From: Matt Palmer <palmer.s.matt@gmail.com>
Date: Mon, 22 Apr 2024 17:17:05 -0400
Subject: [PATCH] refactor(ux): update CLI, README and Makefile to improve
 usability

- Make the installation process simple, and make subsequent crates easy to add here.
- Explain why we still need to run the server using a shell script.
- Simplify the client command line interface. We didn't need the extra layer of indirection; we only have one command here.
---
 Makefile             | 53 +++++++++++++++++++++++++++++++-------------
 README.md            | 51 +++++++++++++++++++++---------------------
 client/src/client.rs |  9 +++++---
 client/src/main.rs   | 43 ++++++++++++++++++++++-------------
 run.sh               | 12 +++++-----
 5 files changed, 101 insertions(+), 67 deletions(-)

diff --git a/Makefile b/Makefile
index 6a5a740..53489e9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,24 +1,45 @@
-BIN_NAME = server
-BUILD_MODE = release
+BUILD_MODE:=release
+PACKAGE=server client
+BINARIES=$(addprefix target/$(BUILD_MODE)/, $(PACKAGE))
+INSTALLED_BINARIES=$(addprefix $(INSTALL_DIR)/voice-, $(PACKAGE))
 
-TARGET = $(BIN_NAME)_$(BUILD_MODE)
+all: $(BINARIES)
 
-BIN = target/$(BUILD_MODE)/$(BIN_NAME)
+.PHONY: require-bin
+require-bin:
+ifndef INSTALL_DIR
+	$(error INSTALL_DIR is not set)
+endif
 
-all: $(TARGET)
+.PHONY: install
+install: require-bin $(INSTALLED_BINARIES)
 
 .PHONY: clean
-clean:
+clean: clean-bin
 	cargo clean
-	rm -f $(TARGET)
 
-$(TARGET): $(BIN)
-	cp $(BIN) $(TARGET)
-
-$(BIN):
-	cargo build --$(BUILD_MODE) -p $(BIN_NAME)
-
-DAEMON = local.personal.transcription
-
-kick: $(TARGET)
+.PHONY: clean-bin
+clean-bin:
+	rm -f $(BINARIES)
+ifdef INSTALL_DIR # ifdef takes a variable name, not its expansion
+	rm -f $(INSTALLED_BINARIES)
+endif
+
+$(INSTALL_DIR)/voice-%: target/$(BUILD_MODE)/%
+	cp $< $@
+
+ifeq ($(words $(PACKAGE)),1)
+target/$(BUILD_MODE)/%:
+else
+$(BINARIES):
+endif
+ifeq ($(BUILD_MODE),release)
+	cargo build --$(BUILD_MODE) $(foreach p, $(PACKAGE),-p $p)
+else
+	cargo build $(foreach p, $(PACKAGE),-p $p)
+endif
+
+DAEMON:=local.personal.transcription
+
+kick: require-bin $(INSTALL_DIR)/voice-server
 	launchctl kickstart -k gui/$(shell id -u)/$(DAEMON)
diff --git a/README.md b/README.md
index 4f513e3..a4cfde1 100644
--- a/README.md
+++ b/README.md
@@ -6,35 +6,34 @@ Bind requests to hotkeys as you prefer. I'm running the server as a daemon with
 
 If you don't already have a `whisper.cpp`-compatible model, follow that project's [quick-start instructions](https://github.com/ggerganov/whisper.cpp#quick-start) to get one.
 
-Start the server:
-`./run.sh macbook ggml-base.en.bin /tmp/whisper.sock`
+## Quick start
 
-Start recording: `curl -X POST -H "Content-Type: application/json" -d "$body" "http://127.0.0.1:8088/voice/$1"`
+In your terminal:
 
-Example request body:
-```json
-{
-  // partial name matches OK
-  "input_device": "MacBook Pro Microphone",
+```sh
+# Build the server and client
+INSTALL_DIR=/something/on/your/PATH make install
 
-  // optional
-  "sample_rate": 44100,
-}
+# Start the server
+# (see run.sh for why running the binary directly doesn't work yet)
+./run.sh localhost:8088 $PATH_TO_MODEL
 ```
-Response: `{ "type": "ack" }`
-
-Stop recording: `curl -X POST http://localhost:8088/voice/stop`
-Example response:
-```json
-{
-  "data": {
-    "content": "And we're recording and we're doing stuff and then we're going to send a stop message.",
-    "mode": {
-      "type": "live_typing"
-    }
-  },
-  "type": "transcription"
-}
+
+In a separate shell:
+
+```sh
+# Send a start command to the server.
+#
+# Note that `-i` is optional; without it, the server will use the first
+# compatible device. For example, you might pass "MacBook" if you want to use
+# your laptop's built-in mic ("MacBook Pro Microphone").
+voice-client localhost:8088 start -i $PARTIAL_INPUT_DEVICE_NAME
+```
+
+After executing this command, the server will start recording from the specified input device. To get the transcription, send the stop command:
+
+```sh
+voice-client localhost:8088 stop
 ```
 
-Note: the modes that you get back in the output are just metadata. Your client application that talks to the server should also handle processing the transcription differently based on the mode.
+The results will be printed to stdout.
diff --git a/client/src/client.rs b/client/src/client.rs
index 1dff552..48243c9 100644
--- a/client/src/client.rs
+++ b/client/src/client.rs
@@ -6,7 +6,7 @@ pub enum Error {
     Client(#[from] api::Error),
 }
 
-#[derive(Debug, clap::Subcommand)]
+#[derive(Debug, Clone, clap::Subcommand)]
 pub enum Commands {
     Start {
         #[clap(short, long)]
@@ -23,7 +23,7 @@ pub enum Commands {
     },
 }
 
-#[derive(Debug, clap::Args)]
+#[derive(Debug, clap::Parser)]
 pub struct App {
     #[command(subcommand)]
     pub command: Commands,
@@ -59,7 +59,7 @@ impl RunningApp {
     }
 }
 
-mod api {
+pub mod api {
     use serde::de::DeserializeOwned;
 
     use voice::{
@@ -74,6 +74,9 @@ mod api {
 
         #[error("JSON error: {0}")]
         Json(#[from] serde_json::Error),
+
+        #[error("Unexpected response: {0}")]
+        UnexpectedResponse(Response),
     }
 
     pub struct Client {
diff --git a/client/src/main.rs b/client/src/main.rs
index 350552d..31fd799 100644
--- a/client/src/main.rs
+++ b/client/src/main.rs
@@ -1,27 +1,38 @@
 mod client;
 
 use clap::Parser;
-use client::RunningApp;
-
-#[derive(Debug, clap::Parser)]
-#[command(version, about, long_about = None)]
-struct App {
-    #[command(subcommand)]
-    command: Commands,
-}
-
-#[derive(Debug, clap::Subcommand)]
-enum Commands {
-    Client(client::App),
-}
+use client::{App, Commands, RunningApp};
+use voice::app::response::Response;
 
 #[tokio::main]
 async fn main() -> Result<(), client::Error> {
     env_logger::init();
 
-    match App::parse().command {
-        Commands::Client(client_command) => {
-            let resp = RunningApp::from(client_command).execute().await?;
+    let app = App::parse();
+    match &app.command {
+        Commands::Start { .. } => {
+            match RunningApp::from(app).execute().await? {
+                Response::Ack(_) => (),
+                r => return Err(client::api::Error::UnexpectedResponse(r).into()),
+            }
+            Ok(())
+        }
+        Commands::Stop => {
+            match RunningApp::from(app).execute().await? {
+                Response::Transcription { content, .. } => {
+                    let Some(content) = content else {
+                        eprintln!("No transcription available");
+                        return Ok(());
+                    };
+                    println!("{content}");
+                }
+                r => return Err(client::api::Error::UnexpectedResponse(r).into()),
+            }
+            Ok(())
+        }
+
+        _ => {
+            let resp = RunningApp::from(app).execute().await?;
             log::info!("{:?}", resp);
             Ok(())
         }
diff --git a/run.sh b/run.sh
index b51fe34..0d6dbfb 100755
--- a/run.sh
+++ b/run.sh
@@ -12,15 +12,15 @@ fi
 
 # Additionally, whisper-rs doesn't capture Metal-specific logs for some reason.
 # So you'll still see those in the log output, even if you're suppressing.
-export RUST_LOG=whisper_sys_log=error,voice=debug
+export RUST_LOG=whisper_sys_log=error,voice=debug,server=info
+
+# The address on which the HTTP server should listen (e.g. localhost:PORT)
+addr="$1"
 
 # See the whisper.cpp repo for details on how to get a model. I recommend using
 # base or small for best results.
 model_path="$2"
 
-# The address on which the HTTP server should listen (e.g. localhost:PORT)
-addr="$3"
-
-./voice run-daemon \
-    --serve       "$addr"
+voice-server run-daemon \
+    --serve       "$addr" \
     --model       "$model_path"