From b3454792b10e0f9f160f608107c6e39fc871b7e5 Mon Sep 17 00:00:00 2001
From: Traun Leyden
Date: Wed, 22 Aug 2007 14:23:28 +0000
Subject: [PATCH] speech recognition demo in python

git-svn-id: http://svn.freeswitch.org/svn/freeswitch/trunk@5659 d0543943-73ff-0310-b7d9-9358b9ac24b2
---
Reviewer notes (not part of the commit message):
- line structure of the patch restored, one statement per line
- speechtools.py: module docstring moved to the top, input types named,
  detected-speech results that cannot be parsed no longer raise out of
  the input callback, SpeechDetect.grammar initialised
- recipewizard.py: the call is hung up even if the wizard raises
- the index hashes below are those of the original commit and do not
  match the revised file contents
- the author's e-mail address and the xml grammar inside mainmenu() were
  not present in the copy under review and still have to be restored

 scripts/py_modules/__init__.py    |   1 +
 scripts/py_modules/speechtools.py | 231 ++++++++++++++++++++++++++++++
 scripts/recipewizard.py           |  87 +++++++++++
 3 files changed, 319 insertions(+)
 create mode 100644 scripts/py_modules/__init__.py
 create mode 100644 scripts/py_modules/speechtools.py
 create mode 100644 scripts/recipewizard.py

diff --git a/scripts/py_modules/__init__.py b/scripts/py_modules/__init__.py
new file mode 100644
index 0000000000..1bb8bf6d7f
--- /dev/null
+++ b/scripts/py_modules/__init__.py
@@ -0,0 +1 @@
+# empty
diff --git a/scripts/py_modules/speechtools.py b/scripts/py_modules/speechtools.py
new file mode 100644
index 0000000000..88e030db9b
--- /dev/null
+++ b/scripts/py_modules/speechtools.py
@@ -0,0 +1,231 @@
+"""
+A few classes that make it easier to write speech applications
+using Python.  It is roughly modelled after the equivalent that
+is written in JavaScript.
+
+Status: should work, but not yet complete.  Some pending items
+are mentioned in comments.
+"""
+
+from freeswitch import *
+from xml.dom import minidom
+from xml.parsers.expat import ExpatError
+
+VOICE_ENGINE = "cepstral"
+VOICE = "William"
+
+# Input types handed to the callback registered through
+# session.setDTMFCallback(), as told apart in SpeechObtainer.run().
+INPUT_TYPE_DTMF = 0
+INPUT_TYPE_EVENT = 1
+
+
+class Grammar:
+
+    def __init__(self, name, path, obj_path,
+                 min_score=1, confirm_score=400, halt=False):
+        """
+        @param name - name of grammar to reference it later
+        @param path - path to xml grammar file
+        @param obj_path - xml path to find interpretation from root
+                          in result xml, eg, 'interpretation'
+        @param min_score - score threshold to accept result
+        @param confirm_score - if score below this threshold, ask user
+                               if they are sure this is correct
+        @param halt - not sure what was used for in js, currently unused
+        """
+        self.name = name
+        self.path = path
+        self.obj_path = obj_path
+        self.min_score = min_score
+        self.confirm_score = confirm_score
+        self.halt = halt
+
+
+class SpeechDetect:
+
+    def __init__(self, session, module_name, ip_addr):
+        self.session = session
+        self.module_name = module_name
+        self.ip_addr = ip_addr
+        self.grammars = {}
+        # no grammar is active until setGrammar() has been called
+        self.grammar = None
+
+    def addGrammar(self, grammar):
+        self.grammars[grammar.name] = grammar
+
+    def setGrammar(self, name):
+        self.grammar = self.grammars[name]
+
+    def detectSpeech(self):
+        """
+        execute detect_speech on the session with the active grammar.
+        raises AttributeError if setGrammar() was never called
+        """
+        if self.grammar is None:
+            raise AttributeError("no active grammar, "
+                                 "call setGrammar() first")
+        # TODO: we might not always want to call detect_speech
+        # with this cmd, see js version for other options
+        # also see detect_speech_function() in mod_dptools.c
+        cmd = "%s %s %s %s" % (self.module_name,
+                               self.grammar.name,
+                               self.grammar.path,
+                               self.ip_addr)
+        console_log("debug", "calling detect_speech with: %s\n" % cmd)
+        self.session.execute("detect_speech", cmd)
+        console_log("debug", "finished calling detect_speech\n")
+
+
+class SpeechObtainer:
+
+    def __init__(self, speech_detect, required_phrases, wait_time, max_tries):
+        """
+        @param speech_detect - the speech detect object, which holds a
+                               reference to underlying session and can
+                               be re-used by many SpeechObtainers
+        @param required_phrases - the number of required phrases from the
+                                  grammar.  for example if it's prompting for
+                                  the toppings on a sandwich and min toppings
+                                  is 3, use 3.  normally will be 1.
+        @param wait_time - the time, in milliseconds, to wait for
+                           input during each loop iteration
+        @param max_tries - this number multiplied by wait time gives the
+                           'total wait time' before we give up and return
+                           partial or no result
+        """
+        self.speech_detect = speech_detect
+        self.required_phrases = required_phrases
+        self.wait_time = wait_time
+        self.max_tries = max_tries
+
+        self.detected_phrases = []
+
+    def setGrammar(self, grammar):
+        """
+        @param grammar - instance of grammar class
+        """
+        self.grammar = grammar
+        self.speech_detect.addGrammar(grammar)
+        self.speech_detect.setGrammar(self.grammar.name)
+
+    def detectSpeech(self):
+        self.speech_detect.detectSpeech()
+
+    def run(self):
+        """
+        listen for results from the asr engine and return the list
+        of detected phrases.  detection has to be started with
+        detectSpeech() beforehand, run() does not start it.  stops
+        once required_phrases have been detected, after max_tries
+        waits of wait_time each, or when the session is not ready
+        """
+
+        def dtmf_handler(input, itype, funcargs):
+            console_log("INFO","\n\nDTMF itype: %s\n" % itype)
+            if itype == INPUT_TYPE_EVENT:
+                return self.handle_event(input, funcargs)
+            elif itype == INPUT_TYPE_DTMF:
+                console_log("INFO","\n\nDTMF input: %s\n" % input)
+            else:
+                console_log("INFO","\n\nUnknown input type: %s\n" % itype)
+            return None
+
+        num_tries = 0
+
+        session = self.speech_detect.session
+
+        console_log("debug", "setting dtmf callback\n")
+        session.setDTMFCallback(dtmf_handler, "")
+        console_log("debug", "calling getDigits\n")
+
+        console_log("debug", "starting run() while loop\n")
+        while (session.ready() and
+               num_tries < self.max_tries and
+               len(self.detected_phrases) < self.required_phrases):
+            console_log("debug", "top of run() while loop\n")
+            session.collectDigits(self.wait_time)
+            num_tries += 1
+
+        console_log("debug", "while loop finished\n")
+        return self.detected_phrases
+
+    def handle_event(self, event, funcargs):
+        """
+        when the dtmf handler receives an event, it calls back
+        this method.  event is a dictionary with subdictionaries ..
+
+        Example 1
+        =========
+
+        {'body': None, 'headers': {'Speech-Type': 'begin-speaking'}}
+
+        Example 2
+        =========
+
+        {'body': '(xml recognition result holding the phrase "waffles")',
+         'headers': {'Speech-Type': 'detected-speech'}}
+
+        This dictionary is constructed in run_dtmf_callback() in
+        freeswitch_python.cpp
+
+        returns "stop" for both known event types and raises
+        Exception for any other Speech-Type
+        """
+
+        # what kind of event?
+        headers = event['headers']
+        speech_type = headers['Speech-Type']
+        if speech_type == "begin-speaking":
+            # not sure what to do with this, try returning "stop"
+            # so that it might stop playing a sound file once
+            # speech has been detected
+            return "stop"
+        elif speech_type == "detected-speech":
+            # NOTE: have to wrap everything with str() (at least
+            # calls to console_log) because otherwise it chokes on
+            # unicode strings.
+            # TODO: check the score
+            phrase_text = self._extract_phrase(event['body'])
+            if phrase_text:
+                self.detected_phrases.append(str(phrase_text))
+            # do we want to return stop?  what should we return?
+            return "stop"
+        else:
+            raise Exception("Unknown speech event: %s" % speech_type)
+
+    def _extract_phrase(self, body):
+        """
+        pull the detected phrase out of the xml result of a
+        detected-speech event.  returns "" when the body is empty,
+        is not well-formed xml, or holds no element named after the
+        grammar's obj_path, so that a bad result does not raise
+        out of the input callback
+
+        BUG: this assumes only ONE interpretation in the xml
+        result.  rest will get ignored
+        """
+        if not body:
+            console_log("INFO", "detected-speech event has no body\n")
+            return ""
+        try:
+            dom = minidom.parseString(body)
+        except ExpatError:
+            console_log("INFO", "could not parse detected-speech body\n")
+            return ""
+        matches = dom.getElementsByTagName(self.grammar.obj_path)
+        if not matches:
+            console_log("INFO", "no %s element in detected-speech body\n"
+                        % str(self.grammar.obj_path))
+            return ""
+        return self.getText(matches[0])
+
+    def getText(self, elt):
+        """
+        given an element, get its text.  if there is more than
+        one text node child, just append all the text together.
+        """
+        return "".join([str(child.nodeValue)
+                        for child in elt.childNodes
+                        if child.nodeType == child.TEXT_NODE])
diff --git a/scripts/recipewizard.py b/scripts/recipewizard.py
new file mode 100644
index 0000000000..797a381f3f
--- /dev/null
+++ b/scripts/recipewizard.py
@@ -0,0 +1,87 @@
+"""
+Example speech recognition application in python.
+
+How to make this work:
+
+* Get mod_openmrcp working along with an MRCP asr server
+* Add /usr/src/freeswitch/scripts or equivalent to your PYTHONPATH
+* Restart freeswitch
+* Create $GRAMMAR_ROOT/mainmenu.xml from contents in mainmenu() comments
+
+"""
+
+from freeswitch import *
+from py_modules.speechtools import Grammar, SpeechDetect
+from py_modules.speechtools import SpeechObtainer
+
+import os
+import time
+
+VOICE_ENGINE = "cepstral"
+VOICE = "William"
+GRAMMAR_ROOT = "/usr/src/freeswitch_trunk/scripts"
+
+
+class RecipeWizard:
+
+    def __init__(self, session):
+        # NOTE: constructing the wizard runs the whole dialog via main()
+        self.session = session
+        self.session.set_tts_parms(VOICE_ENGINE, VOICE)
+        self.main()
+
+    def main(self):
+        """
+        prompt the caller with "Drinks or food?" and speak back the
+        first phrase that was recognized, if any
+        """
+        console_log("debug", "recipe wizard main()\n")
+        self.speechdetect = SpeechDetect(self.session, "openmrcp", "127.0.0.1")
+        self.speechobtainer = SpeechObtainer(speech_detect=self.speechdetect,
+                                             required_phrases=1,
+                                             wait_time=5000,
+                                             max_tries=3)
+        gfile = os.path.join(GRAMMAR_ROOT, "mainmenu.xml")
+        self.grammar = Grammar("mainmenu", gfile, "input",
+                               min_score=80, confirm_score=90)
+        self.speechobtainer.setGrammar(self.grammar)
+        console_log("debug", "calling speechobtainer.run()\n")
+        self.speechobtainer.detectSpeech()
+        self.session.speak("Hello. Welcome to the recipe wizard. Drinks or food?")
+        result = self.speechobtainer.run()
+        console_log("debug", "speechobtainer.run() result: %s\n" % result)
+        if result:
+            self.session.speak("Received result. Result is: %s" % result[0])
+        else:
+            self.session.speak("Sorry, I did not hear you")
+
+        console_log("debug", "speechobtainer.run() finished\n")
+
+
+def mainmenu():
+    """
+    placeholder that only carries the contents for mainmenu.xml
+
+    NOTE(review): the xml markup of the grammar did not survive in
+    this copy of the patch, only the text of its two items did:
+
+        drinks
+        food
+
+    restore the complete grammar from the original commit before
+    following the setup steps at the top of this file
+    """
+    pass
+
+
+def handler(uuid):
+    """
+    entry point: answer the call, run the wizard, and hang up even
+    if the wizard raises
+    """
+    session = PySession(uuid)
+    session.answer()
+    try:
+        RecipeWizard(session)
+    finally:
+        session.hangup("1")