gpt4 book ai didi

android - 使用 sphinx4 进行语音识别的客户端-服务器实现

转载 作者:行者123 更新时间:2023-11-30 02:33:36 25 4
gpt4 key购买 nike

我想识别 Android 设备(客户端)通过麦克风采集到的语音。客户端把语音数据封装成 DatagramPacket 发送到服务器,服务器使用 Sphinx 4 进行语音识别。但是服务器端始终得不到任何结果,result 总是 null。问题出在哪里?

客户端

import android.media.AudioFormat;
import android.media.AudioRecord;
import android.media.MediaRecorder;


import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.net.DatagramPacket;
import java.net.DatagramSocket;
import java.net.InetAddress;
import java.net.SocketException;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;

public class Recording {

public byte[] buffer;
public DatagramSocket socket;
private int port = 8080;
AudioRecord audioRecord;

private int sampleRate = 16000;
private int channelConfig = AudioFormat.CHANNEL_IN_MONO;
private int audioFormat = AudioFormat.ENCODING_PCM_16BIT;

private boolean status = true;

public void stopListener() {
status = false;
//audioRecord.stop();
audioRecord.release();
}

public void startListener() {
status = true;
startStreaming();
}

private void startStreaming() {

Thread streamThread = new Thread(new Runnable() {

@Override
public void run() {
try {
int minBufSize = AudioRecord.getMinBufferSize(sampleRate, channelConfig, audioFormat);
System.out.println("minBufSize = " + minBufSize);
socket = new DatagramSocket();
System.out.println("Socket Created");

byte[] buffer = new byte[minBufSize];

DatagramPacket packet;
final InetAddress destination = InetAddress.getByName("192.168.0.74");
System.out.println("Ip address recieved");

socket.connect(destination,port);
System.out.println("Socket connected");

audioRecord = new AudioRecord(MediaRecorder.AudioSource.MIC,sampleRate,channelConfig,audioFormat,minBufSize);
System.out.println("Record initialized");
audioRecord.startRecording();


while (status)
{
//reading data from MIC into buffer
minBufSize = audioRecord.read(buffer,0,buffer.length);
// System.out.println("minBufSze were read " + minBufSize);


//putting buffer in the packet
packet = new DatagramPacket(buffer,buffer.length,destination,port);

socket.send(packet);
}

}

} catch (SocketException e) {
e.printStackTrace();
} catch (UnknownHostException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}

}
});
streamThread.start();
}
}

服务器端

import edu.cmu.sphinx.tools.audio.AudioData;

import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import java.io.*;
import java.net.DatagramPacket;
import java.net.DatagramSocket;
import edu.cmu.sphinx.api.Configuration;
import edu.cmu.sphinx.api.SpeechResult;
import edu.cmu.sphinx.api.StreamSpeechRecognizer;
import edu.cmu.sphinx.frontend.util.AudioFileDataSource;
import edu.cmu.sphinx.frontend.util.Microphone;
import edu.cmu.sphinx.frontend.util.StreamDataSource;
import edu.cmu.sphinx.recognizer.Recognizer;
import edu.cmu.sphinx.result.Lattice;
import edu.cmu.sphinx.result.LatticeOptimizer;
import edu.cmu.sphinx.result.Result;
import edu.cmu.sphinx.tools.audio.AudioData;
import edu.cmu.sphinx.util.props.ConfigurationManager;

import javax.sound.sampled.AudioInputStream;
import java.io.IOException;

/**
 * UDP server that receives raw PCM audio datagrams on port 8080 and hands the
 * payload of each datagram to {@link Recognition} as a 16 kHz, 16-bit, mono,
 * signed, little-endian audio stream.
 *
 * <p>NOTE(review): recognition is attempted once per datagram, so each call sees
 * only the audio carried by a single packet. If the sender emits one small
 * microphone buffer per packet, that is likely too short to recognise anything;
 * consider buffering packets or feeding the recognizer one continuous stream.
 */
public class Server {

    /** Size of the receive buffer; datagrams longer than this are truncated by receive(). */
    private static final int MAX_PACKET_BYTES = 4096;

    /**
     * Runs the receive-and-recognise loop until an I/O error occurs.
     *
     * @param args unused
     * @throws IOException if the socket cannot be opened or a receive fails
     */
    public static void main(String[] args) throws IOException {
        int port = 8080;
        int sampleRate = 16000;
        Recognition recognition = new Recognition();

        System.out.println("Welcome to server side.");
        DatagramSocket serverSocket = new DatagramSocket(port);
        try {
            AudioFormat format = new AudioFormat(sampleRate, 16, 1, true, false);
            int frameSize = format.getFrameSize();
            byte[] receiveData = new byte[MAX_PACKET_BYTES];

            while (true) {
                DatagramPacket receivePacket = new DatagramPacket(receiveData, receiveData.length);
                serverSocket.receive(receivePacket);

                // Expose only the bytes that arrived in this datagram; the rest of the
                // array still holds data from earlier, longer packets.
                int byteCount = receivePacket.getLength();
                ByteArrayInputStream bais = new ByteArrayInputStream(
                        receivePacket.getData(), receivePacket.getOffset(), byteCount);

                // AudioInputStream takes its length in sample frames, not bytes.
                long frameCount = byteCount / frameSize;
                if (frameCount == 0) {
                    continue;
                }
                AudioInputStream ais = new AudioInputStream(bais, format, frameCount);

                System.out.println("ServerSocket is " + serverSocket.isConnected());
                recognition.Recognize(ais);
            }
        } finally {
            serverSocket.close();
        }
    }
}


/**
 * Wraps a Sphinx-4 {@link Recognizer} that is configured from an XML file on the
 * classpath and fed audio through the {@code audioFileDataSource} component.
 */
public class Recognition {

    /** Classpath location of the Sphinx-4 XML configuration. */
    private static final String CONFIG_RESOURCE = "/src/config.xml";

    ConfigurationManager cm;
    Recognizer recognizer;
    Configuration configuration;
    AudioInputStream audioInputStream;

    /**
     * Loads the XML configuration, looks up the {@code recognizer} component and
     * allocates it.
     *
     * @throws IllegalStateException if the configuration resource is not on the classpath
     */
    public Recognition() {
        configuration = new Configuration();

        // getResource returns null when the resource is missing; fail here with a
        // clear message instead of passing null on to ConfigurationManager.
        java.net.URL configUrl = Recognition.class.getResource(CONFIG_RESOURCE);
        if (configUrl == null) {
            throw new IllegalStateException(
                    "Sphinx configuration not found on classpath: " + CONFIG_RESOURCE);
        }

        cm = new ConfigurationManager(configUrl);
        recognizer = (Recognizer) cm.lookup("recognizer");
        recognizer.allocate();
    }

    /**
     * Runs one recognition pass over the given audio and prints the outcome.
     *
     * <p>When the recognizer returns a result, a lattice is built from it, optimized,
     * all of its paths are dumped, and the best result without fillers is printed.
     * When the recognizer returns {@code null}, a fallback message is printed.
     *
     * @param ais audio to recognise; installed as the input of the
     *            {@code audioFileDataSource} component
     */
    public void Recognize(AudioInputStream ais) {
        audioInputStream = ais;

        AudioFileDataSource dataSource = (AudioFileDataSource) cm.lookup("audioFileDataSource");
        dataSource.setInputStream(audioInputStream, "stream");

        Result result = recognizer.recognize();
        System.out.println("Say: (Good morning | Hello) ( Bhiksha | Evandro | Paul | Philip | Rita | Will )");

        if (result == null) {
            System.out.println("I could't hear you!");
            return;
        }

        Lattice lattice = new Lattice(result);
        LatticeOptimizer optimizer = new LatticeOptimizer(lattice);
        optimizer.optimize();
        lattice.dumpAllPaths();
        String resultText = result.getBestResultNoFiller();
        System.out.println("I heard: " + resultText + '\n');
    }
}

配置文件

<?xml version="1.0" encoding="UTF-8"?>

<!--
Sphinx-4 Configuration file
-->

<!-- ******************************************************** -->
<!-- biship configuration file -->
<!-- ******************************************************** -->

<config>
<!-- ******************************************************** -->
<!-- frequently tuned properties -->
<!-- ******************************************************** -->
<property name="absoluteBeamWidth" value="500"/>
<property name="relativeBeamWidth" value="1E-80"/>
<property name="absoluteWordBeamWidth" value="20"/>
<property name="relativeWordBeamWidth" value="1E-60"/>
<property name="wordInsertionProbability" value="1E-16"/>
<property name="languageWeight" value="7.0"/>
<property name="silenceInsertionProbability" value=".1"/>
<property name="frontend" value="epFrontEnd"/>
<property name="recognizer" value="recognizer"/>
<property name="showCreations" value="false"/>


<!-- ******************************************************** -->
<!-- word recognizer configuration -->
<!-- ******************************************************** -->

<component name="recognizer"
type="edu.cmu.sphinx.recognizer.Recognizer">
<property name="decoder" value="decoder"/>
<propertylist name="monitors">
<item>accuracyTracker </item>
<item>speedTracker </item>
<item>memoryTracker </item>
<item>recognizerMonitor </item>
</propertylist>
</component>

<!-- ******************************************************** -->
<!-- The Decoder configuration -->
<!-- ******************************************************** -->

<component name="decoder" type="edu.cmu.sphinx.decoder.Decoder">
<property name="searchManager" value="wordPruningSearchManager"/>
<property name="featureBlockSize" value="50"/>
</component>

<!-- ******************************************************** -->
<!-- The Search Manager -->
<!-- ******************************************************** -->

<component name="wordPruningSearchManager"
type="edu.cmu.sphinx.decoder.search.WordPruningBreadthFirstSearchManager">
<property name="logMath" value="logMath"/>
<property name="linguist" value="lexTreeLinguist"/>
<property name="pruner" value="trivialPruner"/>
<property name="scorer" value="threadedScorer"/>
<property name="activeListManager" value="activeListManager"/>
<property name="growSkipInterval" value="0"/>
<property name="checkStateOrder" value="false"/>
<property name="buildWordLattice" value="true"/>
<property name="acousticLookaheadFrames" value="1.7"/>
<property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
</component>


<!-- ******************************************************** -->
<!-- The Active Lists -->
<!-- ******************************************************** -->

<component name="activeListManager"
type="edu.cmu.sphinx.decoder.search.SimpleActiveListManager">
<propertylist name="activeListFactories">
<item>standardActiveListFactory</item>
<item>wordActiveListFactory</item>
<item>wordActiveListFactory</item>
<item>standardActiveListFactory</item>
<item>standardActiveListFactory</item>
<item>standardActiveListFactory</item>
</propertylist>
</component>

<component name="standardActiveListFactory"
type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
<property name="logMath" value="logMath"/>
<property name="absoluteBeamWidth" value="${absoluteBeamWidth}"/>
<property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
</component>

<component name="wordActiveListFactory"
type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
<property name="logMath" value="logMath"/>
<property name="absoluteBeamWidth" value="${absoluteWordBeamWidth}"/>
<property name="relativeBeamWidth" value="${relativeWordBeamWidth}"/>
</component>

<!-- ******************************************************** -->
<!-- The Pruner -->
<!-- ******************************************************** -->
<component name="trivialPruner"
type="edu.cmu.sphinx.decoder.pruner.SimplePruner"/>

<!-- ******************************************************** -->
<!-- The Scorer -->
<!-- ******************************************************** -->
<component name="threadedScorer"
type="edu.cmu.sphinx.decoder.scorer.ThreadedAcousticScorer">
<property name="frontend" value="${frontend}"/>
</component>

<!-- ******************************************************** -->
<!-- The linguist configuration -->
<!-- ******************************************************** -->

<component name="lexTreeLinguist"
type="edu.cmu.sphinx.linguist.lextree.LexTreeLinguist">
<property name="logMath" value="logMath"/>
<property name="acousticModel" value="wsj"/>
<property name="languageModel" value="trigramModel"/>
<property name="dictionary" value="dictionary"/>
<property name="addFillerWords" value="false"/>
<property name="fillerInsertionProbability" value="1E-10"/>
<property name="generateUnitStates" value="false"/>
<property name="wantUnigramSmear" value="true"/>
<property name="unigramSmearWeight" value="1"/>
<property name="wordInsertionProbability"
value="${wordInsertionProbability}"/>
<property name="silenceInsertionProbability"
value="${silenceInsertionProbability}"/>
<property name="languageWeight" value="${languageWeight}"/>
<property name="unitManager" value="unitManager"/>
</component>


<!-- ******************************************************** -->
<!-- The Dictionary configuration -->
<!-- ******************************************************** -->
<component name="dictionary"
type="edu.cmu.sphinx.linguist.dictionary.FastDictionary">
<property name="dictionaryPath"
value="resource:/bld/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/dict/cmudict.0.6d"/>
<property name="fillerPath"
value="resource:/bld/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/noisedict"/>
<property name="addSilEndingPronunciation" value="false"/>
<property name="wordReplacement" value="&lt;sil&gt;"/>
<property name="unitManager" value="unitManager"/>
</component>


<!-- ******************************************************** -->
<!-- The Language Model configuration -->
<!-- ******************************************************** -->
<component name="trigramModel"
type="edu.cmu.sphinx.linguist.language.ngram.large.LargeTrigramModel">
<property name="unigramWeight" value=".5"/>
<property name="maxDepth" value="3"/>
<property name="logMath" value="logMath"/>
<property name="dictionary" value="dictionary"/>
<property name="location"
value="./models/language/en-us.lm.dmp"/>
</component>


<!-- ******************************************************** -->
<!-- The acoustic model configuration -->
<!-- ******************************************************** -->
<component name="wsj"
type="edu.cmu.sphinx.linguist.acoustic.tiedstate.TiedStateAcousticModel">
<property name="loader" value="wsjLoader"/>
<property name="unitManager" value="unitManager"/>
</component>

<component name="wsjLoader" type="edu.cmu.sphinx.linguist.acoustic.tiedstate.Sphinx3Loader">
<property name="logMath" value="logMath"/>
<property name="unitManager" value="unitManager"/>
<property name="location" value="resource:/bld/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz"/>
</component>

<!-- ******************************************************** -->
<!-- The unit manager configuration -->
<!-- ******************************************************** -->

<component name="unitManager"
type="edu.cmu.sphinx.linguist.acoustic.UnitManager"/>


<!-- ******************************************************** -->
<!-- The frontend configuration -->
<!-- ******************************************************** -->

<component name="epFrontEnd" type="edu.cmu.sphinx.frontend.FrontEnd">
<propertylist name="pipeline">
<item>audioFileDataSource </item>
<item>dataBlocker </item>
<item>speechClassifier </item>
<item>speechMarker </item>
<item>nonSpeechDataFilter </item>
<item>preemphasizer </item>
<item>windower </item>
<item>fft </item>
<item>melFilterBank </item>
<item>dct </item>
<item>liveCMN </item>
<item>featureExtraction </item>
</propertylist>
</component>

<component name="audioFileDataSource" type="edu.cmu.sphinx.frontend.util.AudioFileDataSource"/>


<component name="microphone"
type="edu.cmu.sphinx.frontend.util.Microphone">
<property name="closeBetweenUtterances" value="false"/>
</component>

<component name="dataBlocker" type="edu.cmu.sphinx.frontend.DataBlocker"/>

<component name="speechClassifier"
type="edu.cmu.sphinx.frontend.endpoint.SpeechClassifier">
<property name="threshold" value="13"/>
</component>

<component name="nonSpeechDataFilter"
type="edu.cmu.sphinx.frontend.endpoint.NonSpeechDataFilter"/>

<component name="speechMarker"
type="edu.cmu.sphinx.frontend.endpoint.SpeechMarker">
<property name="speechTrailer" value="50"/>
</component>

<component name="preemphasizer"
type="edu.cmu.sphinx.frontend.filter.Preemphasizer"/>

<component name="windower"
type="edu.cmu.sphinx.frontend.window.RaisedCosineWindower"/>

<component name="fft"
type="edu.cmu.sphinx.frontend.transform.DiscreteFourierTransform"/>

<component name="melFilterBank"
type="edu.cmu.sphinx.frontend.frequencywarp.MelFrequencyFilterBank"/>

<component name="dct"
type="edu.cmu.sphinx.frontend.transform.DiscreteCosineTransform"/>

<component name="liveCMN"
type="edu.cmu.sphinx.frontend.feature.LiveCMN"/>

<component name="featureExtraction"
type="edu.cmu.sphinx.frontend.feature.DeltasFeatureExtractor"/>

<!-- ******************************************************* -->
<!-- monitors -->
<!-- ******************************************************* -->

<component name="accuracyTracker"
type="edu.cmu.sphinx.instrumentation.BestPathAccuracyTracker">
<property name="recognizer" value="${recognizer}"/>
<property name="showRawResults" value="false"/>
<property name="showAlignedResults" value="false"/>
</component>

<component name="memoryTracker"
type="edu.cmu.sphinx.instrumentation.MemoryTracker">
<property name="recognizer" value="${recognizer}"/>
<property name="showDetails" value="false"/>
<property name="showSummary" value="false"/>
</component>

<component name="speedTracker"
type="edu.cmu.sphinx.instrumentation.SpeedTracker">
<property name="recognizer" value="${recognizer}"/>
<property name="frontend" value="${frontend}"/>
<property name="showDetails" value="false"/>
</component>

<component name="recognizerMonitor"
type="edu.cmu.sphinx.instrumentation.RecognizerMonitor">
<property name="recognizer" value="${recognizer}"/>
<propertylist name="allocatedMonitors">
<item>configMonitor </item>
</propertylist>
</component>

<component name="configMonitor"
type="edu.cmu.sphinx.instrumentation.ConfigMonitor">
<property name="showConfig" value="false"/>
</component>


<!-- ******************************************************* -->
<!-- Miscellaneous components -->
<!-- ******************************************************* -->

<component name="logMath" type="edu.cmu.sphinx.util.LogMath">
<property name="logBase" value="1.0001"/>
<property name="useAddTable" value="true"/>
</component>
</config>

最佳答案

配置文件有误。正确的配置可以在最新的 sphinx 源码中名为 default.config.xml 的文件里找到。更好的做法是使用无需配置文件的高级 API,如下面的教程所述:

http://cmusphinx.sourceforge.net/wiki/tutorialsphinx4

接收数据的代码也是错误的,您只需将带有音频数据的套接字流传递给识别器API,识别器将自行检索数据包。否则你必须实现一个特殊的 sphinx4 数据源来接收来自服务器的数据包。

从套接字流中识别原始音频:

// Recognise raw audio read directly from a socket's input stream.
// `configuration` and `clientSocket` are defined outside this snippet.
// NOTE(review): clientSocket must provide getInputStream(), which DatagramSocket
// does not have — presumably a stream-oriented java.net.Socket; confirm.
StreamSpeechRecognizer recognizer = new StreamSpeechRecognizer(configuration);
recognizer.startRecognition(clientSocket.getInputStream()); // Here we tell recognizer to read data from socket stream
// Fetch a single result. NOTE(review): presumably null once the stream is exhausted — confirm against the sphinx4 API docs.
SpeechResult result = recognizer.getResult();
// End the recognition session started above.
recognizer.stopRecognition();

关于android - 使用 sphinx4 进行语音识别的客户端-服务器实现,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/26950899/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com