diff --git a/README b/README index 972d3c3..2f336a1 100644 --- a/README +++ b/README @@ -2,16 +2,18 @@ This extern still needs a lot of work still: +-output for text +-debug the 'auto' mode -variable models (just some functionality code) -better controls -general code cleanup but it does work. -This project is an old one that I recently blew the dust off of. I quit working on it 3 years ago because it turned out that pocketsphinx has a function call collision with pd's gui engine on linux. This causes (on linux) pd to crash anytime you try to decode. Back then I couldn't figure out how to resolve this issue without rebbuilding either sphinx or pd to avoid the collision. Clearly this is not an elegant solution if I want people to use this software. Now, I know more about libraries and dynamic loading, so I rewrote the extern to use the dynamic libpocketsphinx rather than the static one. Thus, in essence, the heart of [recog~] (recog_tilde_decode) is simply a wrapper for libpocketsphinx.so. However, since this older code, the style is pretty bad. A word of warning to anyone that wants to pick this up. I will continue to rewrite and improve the functionality as time allows. +This project is an old one that I recently blew the dust off of. I quit working on it 3 years ago because it turned out that pocketsphinx has a function call collision with pd's gui engine on linux. This causes (on linux) pd to crash anytime you try to decode. Back then I couldn't figure out how to resolve this issue without rebuilding either sphinx or pd with altered function names to avoid the collision. Clearly this is not an elegant solution. Now, I know more about libraries and dynamic loading, so I rewrote the extern to use the dynamic libpocketsphinx rather than the static one. Thus, in essence, the heart of [recog~] (recog_tilde_decode) is simply a wrapper for libpocketsphinx.so. This is older code so the style is kinda bad. 
I will continue to rewrite and improve the functionality as time allows. -Other things to think about are designing a pd patch to automate start/decode calls and building a sister extern that can train a custom model (HARD!). +Other things to think about are designing a pd patch to automate start/decode calls and building a sister extern that can be used to train a custom model. sphinx ------ @@ -26,7 +28,7 @@ and the same with pocketsphinx. If you are unfamiliar, you might want to read the 'learn' pages from the above url anyway to get acquainted. -Thanks Carnegie Melon University for this incredible software! +Thanks Carnegie Mellon University for this wonderful software! building: diff --git a/README~ b/README~ index 225b685..2f336a1 100644 --- a/README~ +++ b/README~ @@ -1,13 +1,19 @@ -[recog~] is a speech recognition extern built around CMU's sphinx library (see license below). This extern needs a lot of work still: +[recog~] is a speech recognition extern built around CMU's sphinx library (see license below). --variable models --better controls +This extern still needs a lot of work: + +-output for text +-debug the 'auto' mode +-variable models (just some functionality code) +-better controls -general code cleanup -but it does work. This project is an old one recently refurbished. I quit 3 years ago because turned out that pocketsphinx has a function call collision with pd's gui engine on linux. This causes (on linux) pd to crash anytime you try to decode. The solution was to load the library as a dynamic rather than static library (hence the indirect function calls). So in essence, the heart of [recog~] (recog_tilde_decode) is simply a wrapper for libpocketsphinx.so. However, since this older code, the style is pretty bad. A word of warning to anyone that wants to pick this up. +but it does work. + +This project is an old one that I recently blew the dust off of. 
I quit working on it 3 years ago because it turned out that pocketsphinx has a function call collision with pd's gui engine on linux. This causes (on linux) pd to crash anytime you try to decode. Back then I couldn't figure out how to resolve this issue without rebuilding either sphinx or pd with altered function names to avoid the collision. Clearly this is not an elegant solution. Now, I know more about libraries and dynamic loading, so I rewrote the extern to use the dynamic libpocketsphinx rather than the static one. Thus, in essence, the heart of [recog~] (recog_tilde_decode) is simply a wrapper for libpocketsphinx.so. This is older code so the style is kinda bad. I will continue to rewrite and improve the functionality as time allows. -Other things to think about are designing a pd patch to automate start/decode calls and building a sister extern that can train a custom model (HARD!). +Other things to think about are designing a pd patch to automate start/decode calls and building a sister extern that can be used to train a custom model. sphinx ------ @@ -20,9 +26,9 @@ On my computer, after building and installing the libraries, I had to move the s and the same with pocketsphinx. -You might want to read the 'learn' pages from the above url anyway to get acquainted with sphinx and speech recognition if you haven't already. +If you are unfamiliar, you might want to read the 'learn' pages from the above url anyway to get acquainted. -Thanks Carnegie Melon University for this incredible software! +Thanks Carnegie Mellon University for this wonderful software! 
building: diff --git a/recog~-help.pd b/recog~-help.pd index fdaac31..fc372dd 100755 --- a/recog~-help.pd +++ b/recog~-help.pd @@ -1,6 +1,6 @@ -#N canvas 702 151 657 513 10; -#X obj 71 218 recog~; -#N canvas 1 244 697 514 arrays 1; +#N canvas 700 149 657 513 10; +#X obj 126 196 recog~; +#N canvas 1 242 697 514 arrays 1; #N canvas 0 50 450 300 (subpatch) 0; #X array sound 133740 float 2; #X coords 0 1 133740 -1 200 140 1; diff --git a/recog~.c b/recog~.c index a110beb..57e2ad5 100755 --- a/recog~.c +++ b/recog~.c @@ -67,11 +67,17 @@ typedef ps_decoder_t *(*fn_ps_init)(cmd_ln_t *config); typedef int (*fn_ps_start_utt)(ps_decoder_t *ps, char const *uttid); -typedef int (*fn_ps_process_raw)(ps_decoder_t *ps, int16 const *data, size_t n_samples, int no_search, int full_utt); +typedef int (*fn_ps_process_raw)(ps_decoder_t *ps, + int16 const *data, + size_t n_samples, + int no_search, + int full_utt); typedef int (*fn_ps_end_utt)(ps_decoder_t *ps); -typedef char const *(*fn_ps_get_hyp)(ps_decoder_t *ps, int32 *out_best_score, char const **out_uttid); +typedef char const *(*fn_ps_get_hyp)(ps_decoder_t *ps, + int32 *out_best_score, + char const **out_uttid); typedef int(*fn_ps_free)(ps_decoder_t *ps); @@ -88,8 +94,6 @@ typedef struct _dl_sphinx{ fn_ps_get_hyp r_ps_get_hyp; fn_ps_free r_ps_free; - - int32 score; }t_dl_sphinx; @@ -129,6 +133,7 @@ typedef struct _recog_tilde{ t_int sampleRate; t_histogram *x_histo; + //a histogram is associated with the charBuff //used to compare strings and (hopefully) report correct //hyps @@ -144,12 +149,10 @@ typedef struct _recog_tilde{ int writeout, writeread; }t_recog_tilde; - //structs from sphinx don't put these in the object struct?? 
- t_symbol *hmm; t_symbol *lm; t_symbol *dict; - +t_symbol *lib_dir; //---------------------------------- @@ -161,9 +164,12 @@ static void output(t_recog_tilde *x); //---------------------------------- //---------------------------------- + static float spline_interpolate(t_float *buffer, t_int bufferLength, t_float findex) //this routine is from musicDSP.org -//I literally copied someone else's code (works great) +//I literally copied someone else's code +//this may be overkill in terms of downsampling, but +//it is accurate { @@ -184,7 +190,6 @@ static float spline_interpolate(t_float *buffer, t_int bufferLength, t_float fin + fr *((p3-p2)*50.0+(p1-p4)*25.0+(p5-p0)*5.0))))); } - static void block(t_recog_tilde *x, int n) { int i, j; @@ -261,10 +266,10 @@ static void down_sample(t_recog_tilde *x, t_int n) t_float conv, oneOverConv, f; t_int newsampnum; + x->sampleRate = sys_getsr(); conv = (t_float)x->sampleRate/16000.0; f = (n/conv); - newsampnum = (t_int)f; //routine to keep hold of the right number of samples @@ -370,7 +375,7 @@ static void recog_tilde_init(t_recog_tilde *x) //x->y.r_cl_retain(x->ps); if (x->ps == NULL) pd_error(x, "configuration error!"); - else post("%d", sizeof(x->ps)); + else post("pocketsphinx configured correctly"); } @@ -385,28 +390,6 @@ static void recog_tilde_decode(t_recog_tilde *x) char buf[256]; int hyplen; - /* if (fp_ps == NULL) { */ - /* perror("Failed to open goforward.raw"); */ - /* return 1; */ - /* } */ - - /* fseek(fp_ps, 0, SEEK_SET); */ - /* rv = ps_start_utt(ps, "goforward"); */ - /* if (rv < 0) */ - /* return 1; */ - /* while (!feof(fp_ps)) { */ - /* size_t nsamp; */ - /* nsamp = fread(buf, 2, 512, fp_ps); */ - /* rv = ps_process_raw(ps, buf, nsamp, FALSE, FALSE); */ - /* } */ - /* rv = ps_end_utt(ps); */ - /* if (rv < 0) */ - /* return 1; */ - /* hyp = ps_get_hyp(ps, &score, &uttid); */ - /* if (hyp == NULL) */ - /* return 1; */ - /* printf("Recognized: %s\n", hyp); */ - rv = x->y.r_ps_start_utt(x->ps, NULL); 
//post("this should be 0: %d", rv); @@ -414,7 +397,7 @@ static void recog_tilde_decode(t_recog_tilde *x) rv = x->y.r_ps_process_raw(x->ps, x->decodeAutoBuff, x->autoBuffSize, FALSE, FALSE); else rv = x->y.r_ps_process_raw(x->ps, x->decodeBuff, x->decodeWritePoint, FALSE, FALSE); - post("this should be the number of frames: %d", rv); + //post("this should be the number of frames: %d", rv); rv = x->y.r_ps_end_utt(x->ps); //post("this should also be 0: %d", rv); @@ -467,10 +450,6 @@ static void output(t_recog_tilde *x) freebytes(outv, outc * sizeof(t_atom)); } - -//begin the decoding cycle--this methodology -//will change once the current system is functioning -//perhaps there will be an 'auto' mode... void recog_tilde_output(t_recog_tilde *x) { x->writeout = 1; @@ -494,18 +473,15 @@ void recog_tilde_start(t_recog_tilde *x) void recog_tilde_stop(t_recog_tilde *x) { - - if(x->decodeGo == 1) { + x->decodeGo = 0; x->automode = 0; recog_tilde_decode(x); - } - } void recog_tilde_reinit(t_recog_tilde *x) @@ -587,7 +563,6 @@ t_int *recog_tilde_perform(t_int *w) int i; - for(i=0; iinBuff[x->ioWritePoint++] = in[i]; @@ -622,11 +597,14 @@ t_int *recog_tilde_perform(t_int *w) static void recog_tilde_dsp(t_recog_tilde *x, t_signal **sp) { + dsp_add(recog_tilde_perform, 4, x, sp[0]->s_vec, sp[1]->s_vec, sp[0]->s_n); + } static void *recog_tilde_new(int argc, t_atom *argv) { + t_recog_tilde *x = (t_recog_tilde *)pd_new(recog_tilde_class); outlet_new(&x->x_obj, &s_signal); @@ -666,8 +644,8 @@ static void *recog_tilde_new(int argc, t_atom *argv) //sphinx setup stuff: //recodsider this whole thing: hmm=gensym("./model/hmm/en_US/hub4wsj_sc_8k"); - dict=gensym("./model/lm/en/turtle.dic"); - lm=gensym("./model/lm/en/turtle.DMP"); + dict=gensym("./model/lm/en_US/hub4.5000.dic"); + lm=gensym("./model/lm/en_US/hub4.5000.DMP"); // x->inBuff = (t_float *)t_getbytes(0); @@ -706,7 +684,6 @@ static void *recog_tilde_new(int argc, t_atom *argv) x->x_histo = (t_histogram *)t_getbytes(0); 
x->x_histo = (t_histogram *)t_resizebytes(x->x_histo, 0, sizeof(t_int) * 50); - post("size of char ptr : %d", sizeof(char *)); recog_tilde_init(x); return (void*)x; diff --git a/recog~.o b/recog~.o index cdbae58..40571f5 100644 Binary files a/recog~.o and b/recog~.o differ diff --git a/recog~.pd_linux b/recog~.pd_linux index 9561a45..3ed4542 100755 Binary files a/recog~.pd_linux and b/recog~.pd_linux differ