gpt4 book ai didi

google-cloud-platform - Google Cloud Speech-to-text 非常不准确,最后一个结果包含所有其他结果,并且仅在最后一个结果中包含 speakerTag

转载 作者:行者123 更新时间:2023-12-04 09:30:08 27 4
gpt4 key购买 nike

我正在通过命令行使用 Google Speech-to-Text(语音转文本),但得到了奇怪的结果。
这是我的命令

gcloud beta ml speech recognize-long-running gs://my_bucket_name/call0.mp3 
--language-code=en-US --async --include-word-time-offsets --enable-speaker-diarization
--diarization-speaker-count=2
这是音频文件:
https://dcs.megaphone.fm/LIT9020259030.mp3?key=4b567156fd7bdfaa90992664d4bc667c
问题是:
  • 结果非常非常糟糕且不准确
  • 最后一个结果包含所有其他结果的组合
  • SpeakerTag 仅出现在最后一个结果中
  • 我只得到了说话人 1 的 speakerTag(说话人标签)

    这是结果json:
    {
    "done": true,
    "metadata": {
    "@type": "type.googleapis.com/google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata",
    "lastUpdateTime": "2020-07-13T18:56:33.689140Z",
    "progressPercent": 100,
    "startTime": "2020-07-13T18:27:45.757871Z",
    "uri": "gs://deepagent-db032.appspot.com/conmagi/call1.mp3"
    },
    "name": "398565854464473919",
    "response": {
    "@type": "type.googleapis.com/google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse",
    "results": [
    {
    "alternatives": [
    {
    "confidence": 0.87135065,
    "transcript": "love",
    "words": [
    {
    "endTime": "11.300s",
    "startTime": "10.400s",
    "word": "love"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.48216835,
    "transcript": "you are",
    "words": [
    {
    "endTime": "425.100s",
    "startTime": "424.500s",
    "word": "you"
    },
    {
    "endTime": "425.400s",
    "startTime": "425.100s",
    "word": "are"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.9194219,
    "transcript": "how far is it from",
    "words": [
    {
    "endTime": "475.200s",
    "startTime": "473.800s",
    "word": "how"
    },
    {
    "endTime": "475.500s",
    "startTime": "475.200s",
    "word": "far"
    },
    {
    "endTime": "475.700s",
    "startTime": "475.500s",
    "word": "is"
    },
    {
    "endTime": "475.800s",
    "startTime": "475.700s",
    "word": "it"
    },
    {
    "endTime": "476.100s",
    "startTime": "475.800s",
    "word": "from"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.823343,
    "transcript": "I want",
    "words": [
    {
    "endTime": "629.200s",
    "startTime": "626.700s",
    "word": "I"
    },
    {
    "endTime": "629.800s",
    "startTime": "629.200s",
    "word": "want"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.56559134,
    "transcript": "Blue Ivy",
    "words": [
    {
    "endTime": "990.100s",
    "startTime": "989.500s",
    "word": "Blue"
    },
    {
    "endTime": "991.100s",
    "startTime": "990.100s",
    "word": "Ivy"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.78465956,
    "transcript": "how old is Wawa",
    "words": [
    {
    "endTime": "1599.700s",
    "startTime": "1598.500s",
    "word": "how"
    },
    {
    "endTime": "1600.100s",
    "startTime": "1599.700s",
    "word": "old"
    },
    {
    "endTime": "1600.200s",
    "startTime": "1600.100s",
    "word": "is"
    },
    {
    "endTime": "1600.600s",
    "startTime": "1600.200s",
    "word": "Wawa"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.9475956,
    "transcript": "how are you",
    "words": [
    {
    "endTime": "2022.400s",
    "startTime": "2020s",
    "word": "how"
    },
    {
    "endTime": "2022.500s",
    "startTime": "2022.400s",
    "word": "are"
    },
    {
    "endTime": "2022.600s",
    "startTime": "2022.500s",
    "word": "you"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.7494768,
    "transcript": "New York mall",
    "words": [
    {
    "endTime": "2066.200s",
    "startTime": "2065.800s",
    "word": "New"
    },
    {
    "endTime": "2066.500s",
    "startTime": "2066.200s",
    "word": "York"
    },
    {
    "endTime": "2067s",
    "startTime": "2066.500s",
    "word": "mall"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.6706576,
    "transcript": "call",
    "words": [
    {
    "endTime": "2255.600s",
    "startTime": "2254.500s",
    "word": "call"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.87819797,
    "transcript": "call Paul Wall",
    "words": [
    {
    "endTime": "3041.500s",
    "startTime": "3040.300s",
    "word": "call"
    },
    {
    "endTime": "3041.800s",
    "startTime": "3041.500s",
    "word": "Paul"
    },
    {
    "endTime": "3042.300s",
    "startTime": "3041.800s",
    "word": "Wall"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.8331511,
    "transcript": "no",
    "words": [
    {
    "endTime": "3101.300s",
    "startTime": "3100.800s",
    "word": "no"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.62488914,
    "transcript": "call Jeff",
    "words": [
    {
    "endTime": "3473.100s",
    "startTime": "3470.300s",
    "word": "call"
    },
    {
    "endTime": "3473.500s",
    "startTime": "3473.100s",
    "word": "Jeff"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.9074697,
    "transcript": "call home",
    "words": [
    {
    "endTime": "4166.100s",
    "startTime": "4162.400s",
    "word": "call"
    },
    {
    "endTime": "4166.400s",
    "startTime": "4166.100s",
    "word": "home"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.7917781,
    "transcript": "how old are you",
    "words": [
    {
    "endTime": "4231.800s",
    "startTime": "4231.300s",
    "word": "how"
    },
    {
    "endTime": "4232.200s",
    "startTime": "4231.800s",
    "word": "old"
    },
    {
    "endTime": "4232.300s",
    "startTime": "4232.200s",
    "word": "are"
    },
    {
    "endTime": "4232.400s",
    "startTime": "4232.300s",
    "word": "you"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.70297575,
    "transcript": " Europe",
    "words": [
    {
    "endTime": "4244.200s",
    "startTime": "4243s",
    "word": "Europe"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.84273374,
    "transcript": " how are you",
    "words": [
    {
    "endTime": "5121.500s",
    "startTime": "5115.300s",
    "word": "how"
    },
    {
    "endTime": "5122.100s",
    "startTime": "5121.500s",
    "word": "are"
    },
    {
    "endTime": "5122.300s",
    "startTime": "5122.100s",
    "word": "you"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.7561751,
    "transcript": " the only one",
    "words": [
    {
    "endTime": "6199.900s",
    "startTime": "6199.600s",
    "word": "the"
    },
    {
    "endTime": "6200.400s",
    "startTime": "6199.900s",
    "word": "only"
    },
    {
    "endTime": "6200.800s",
    "startTime": "6200.400s",
    "word": "one"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.6547922,
    "transcript": " call",
    "words": [
    {
    "endTime": "6258.800s",
    "startTime": "6256.800s",
    "word": "call"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.9402823,
    "transcript": " Walgreens",
    "words": [
    {
    "endTime": "6925s",
    "startTime": "6912.300s",
    "word": "Walgreens"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.5217668,
    "transcript": " we want to watch",
    "words": [
    {
    "endTime": "7155.900s",
    "startTime": "7155.500s",
    "word": "we"
    },
    {
    "endTime": "7156.500s",
    "startTime": "7155.900s",
    "word": "want"
    },
    {
    "endTime": "7156.600s",
    "startTime": "7156.500s",
    "word": "to"
    },
    {
    "endTime": "7156.700s",
    "startTime": "7156.600s",
    "word": "watch"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.7971729,
    "transcript": " I love you",
    "words": [
    {
    "endTime": "7199.900s",
    "startTime": "7199.200s",
    "word": "I"
    },
    {
    "endTime": "7202.900s",
    "startTime": "7199.900s",
    "word": "love"
    },
    {
    "endTime": "7203.100s",
    "startTime": "7202.900s",
    "word": "you"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "confidence": 0.8566783,
    "transcript": " how old is Moana",
    "words": [
    {
    "endTime": "7483.800s",
    "startTime": "7481.300s",
    "word": "how"
    },
    {
    "endTime": "7484s",
    "startTime": "7483.800s",
    "word": "old"
    },
    {
    "endTime": "7484.200s",
    "startTime": "7484s",
    "word": "is"
    },
    {
    "endTime": "7484.300s",
    "startTime": "7484.200s",
    "word": "Moana"
    }
    ]
    }
    ],
    "languageCode": "en-us"
    },
    {
    "alternatives": [
    {
    "words": [
    {
    "endTime": "11.300s",
    "speakerTag": 1,
    "startTime": "10.400s",
    "word": "love"
    },
    {
    "endTime": "425.100s",
    "speakerTag": 1,
    "startTime": "424.500s",
    "word": "you"
    },
    {
    "endTime": "425.400s",
    "speakerTag": 1,
    "startTime": "425.100s",
    "word": "are"
    },
    {
    "endTime": "475.200s",
    "speakerTag": 1,
    "startTime": "473.800s",
    "word": "how"
    },
    {
    "endTime": "475.500s",
    "speakerTag": 1,
    "startTime": "475.200s",
    "word": "far"
    },
    {
    "endTime": "475.700s",
    "speakerTag": 1,
    "startTime": "475.500s",
    "word": "is"
    },
    {
    "endTime": "475.800s",
    "speakerTag": 1,
    "startTime": "475.700s",
    "word": "it"
    },
    {
    "endTime": "476.100s",
    "speakerTag": 1,
    "startTime": "475.800s",
    "word": "from"
    },
    {
    "endTime": "629.200s",
    "speakerTag": 1,
    "startTime": "626.700s",
    "word": "I"
    },
    {
    "endTime": "629.800s",
    "speakerTag": 1,
    "startTime": "629.200s",
    "word": "want"
    },
    {
    "endTime": "990.100s",
    "speakerTag": 1,
    "startTime": "989.500s",
    "word": "Blue"
    },
    {
    "endTime": "991.100s",
    "speakerTag": 1,
    "startTime": "990.100s",
    "word": "Ivy"
    },
    {
    "endTime": "1599.700s",
    "speakerTag": 1,
    "startTime": "1598.500s",
    "word": "how"
    },
    {
    "endTime": "1600.100s",
    "speakerTag": 1,
    "startTime": "1599.700s",
    "word": "old"
    },
    {
    "endTime": "1600.200s",
    "speakerTag": 1,
    "startTime": "1600.100s",
    "word": "is"
    },
    {
    "endTime": "1600.600s",
    "speakerTag": 1,
    "startTime": "1600.200s",
    "word": "Wawa"
    },
    {
    "endTime": "2022.400s",
    "speakerTag": 1,
    "startTime": "2020s",
    "word": "how"
    },
    {
    "endTime": "2022.500s",
    "speakerTag": 1,
    "startTime": "2022.400s",
    "word": "are"
    },
    {
    "endTime": "2022.600s",
    "speakerTag": 1,
    "startTime": "2022.500s",
    "word": "you"
    },
    {
    "endTime": "2066.200s",
    "speakerTag": 1,
    "startTime": "2065.800s",
    "word": "New"
    },
    {
    "endTime": "2066.500s",
    "speakerTag": 1,
    "startTime": "2066.200s",
    "word": "York"
    },
    {
    "endTime": "2067s",
    "speakerTag": 1,
    "startTime": "2066.500s",
    "word": "mall"
    },
    {
    "endTime": "2255.600s",
    "speakerTag": 1,
    "startTime": "2254.500s",
    "word": "call"
    },
    {
    "endTime": "3041.500s",
    "speakerTag": 1,
    "startTime": "3040.300s",
    "word": "call"
    },
    {
    "endTime": "3041.800s",
    "speakerTag": 1,
    "startTime": "3041.500s",
    "word": "Paul"
    },
    {
    "endTime": "3042.300s",
    "speakerTag": 1,
    "startTime": "3041.800s",
    "word": "Wall"
    },
    {
    "endTime": "3101.300s",
    "speakerTag": 1,
    "startTime": "3100.800s",
    "word": "no"
    },
    {
    "endTime": "3473.100s",
    "speakerTag": 1,
    "startTime": "3470.300s",
    "word": "call"
    },
    {
    "endTime": "3473.500s",
    "speakerTag": 1,
    "startTime": "3473.100s",
    "word": "Jeff"
    },
    {
    "endTime": "4166.100s",
    "speakerTag": 1,
    "startTime": "4162.400s",
    "word": "call"
    },
    {
    "endTime": "4166.400s",
    "speakerTag": 1,
    "startTime": "4166.100s",
    "word": "home"
    },
    {
    "endTime": "4231.800s",
    "speakerTag": 1,
    "startTime": "4231.300s",
    "word": "how"
    },
    {
    "endTime": "4232.200s",
    "speakerTag": 1,
    "startTime": "4231.800s",
    "word": "old"
    },
    {
    "endTime": "4232.300s",
    "speakerTag": 1,
    "startTime": "4232.200s",
    "word": "are"
    },
    {
    "endTime": "4232.400s",
    "speakerTag": 1,
    "startTime": "4232.300s",
    "word": "you"
    },
    {
    "endTime": "4244.200s",
    "speakerTag": 1,
    "startTime": "4243s",
    "word": "Europe"
    },
    {
    "endTime": "5121.500s",
    "speakerTag": 1,
    "startTime": "5115.300s",
    "word": "how"
    },
    {
    "endTime": "5122.100s",
    "speakerTag": 1,
    "startTime": "5121.500s",
    "word": "are"
    },
    {
    "endTime": "5122.300s",
    "speakerTag": 1,
    "startTime": "5122.100s",
    "word": "you"
    },
    {
    "endTime": "6199.900s",
    "speakerTag": 1,
    "startTime": "6199.600s",
    "word": "the"
    },
    {
    "endTime": "6200.400s",
    "speakerTag": 1,
    "startTime": "6199.900s",
    "word": "only"
    },
    {
    "endTime": "6200.800s",
    "speakerTag": 1,
    "startTime": "6200.400s",
    "word": "one"
    },
    {
    "endTime": "6258.800s",
    "speakerTag": 1,
    "startTime": "6256.800s",
    "word": "call"
    },
    {
    "endTime": "6925s",
    "speakerTag": 1,
    "startTime": "6912.300s",
    "word": "Walgreens"
    },
    {
    "endTime": "7155.900s",
    "speakerTag": 1,
    "startTime": "7155.500s",
    "word": "we"
    },
    {
    "endTime": "7156.500s",
    "speakerTag": 1,
    "startTime": "7155.900s",
    "word": "want"
    },
    {
    "endTime": "7156.600s",
    "speakerTag": 1,
    "startTime": "7156.500s",
    "word": "to"
    },
    {
    "endTime": "7156.700s",
    "speakerTag": 1,
    "startTime": "7156.600s",
    "word": "watch"
    },
    {
    "endTime": "7199.900s",
    "speakerTag": 1,
    "startTime": "7199.200s",
    "word": "I"
    },
    {
    "endTime": "7202.900s",
    "speakerTag": 1,
    "startTime": "7199.900s",
    "word": "love"
    },
    {
    "endTime": "7203.100s",
    "speakerTag": 1,
    "startTime": "7202.900s",
    "word": "you"
    },
    {
    "endTime": "7483.800s",
    "speakerTag": 1,
    "startTime": "7481.300s",
    "word": "how"
    },
    {
    "endTime": "7484s",
    "speakerTag": 1,
    "startTime": "7483.800s",
    "word": "old"
    },
    {
    "endTime": "7484.200s",
    "speakerTag": 1,
    "startTime": "7484s",
    "word": "is"
    },
    {
    "endTime": "7484.300s",
    "speakerTag": 1,
    "startTime": "7484.200s",
    "word": "Moana"
    }
    ]
    }
    ]
    }
    ]
    }
    }

    最佳答案

    我遇到了同样的问题,尤其是说话人分离(diarization)的效果很差。
    我也尝试过用 AWS 获取转录文本,发现它的单词错误率更高,但能更好地识别说话人之间的切换。
    如您所知,这是一个测试版(beta)功能,在该阶段他们没有需要履行的 SLA(服务水平协议)。
    我向谷歌团队报告了这个错误,他们回复了:

    There are no SLAs or technical support obligations in a beta release unless otherwise specified in product terms[...]. The average beta phase lasts about six months.


    所以我相信团队正式发布这个功能还需要一段时间。
    https://cloud.google.com/speech-to-text/docs/multiple-voices

    关于google-cloud-platform - Google Cloud Speech-to-text 非常不准确,最后一个结果包含所有其他结果,并且仅在最后一个结果中包含 speakerTag,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/62883413/

    27 4 0
    Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
    广告合作:1813099741@qq.com 6ren.com