From fac62aaccc15b661b090bcaeb6cc8c3ae3fde75d Mon Sep 17 00:00:00 2001 From: makaveli10 Date: Thu, 8 Aug 2024 06:05:12 -0400 Subject: [PATCH] Fix hallucinations with no_speech_thresh Signed-off-by: makaveli10 --- whisper_live/server.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/whisper_live/server.py b/whisper_live/server.py index b6efc77..b668a6d 100644 --- a/whisper_live/server.py +++ b/whisper_live/server.py @@ -973,6 +973,7 @@ def speech_to_text(self): input_bytes, duration = self.get_audio_chunk_for_processing() if duration < 1.0: + time.sleep(0.1) # wait for audio chunks to arrive continue try: input_sample = input_bytes.copy() @@ -1046,12 +1047,14 @@ def update_segments(self, segments, duration): self.transcript.append(self.format_segment(start, end, text_)) offset = min(duration, s.end) - self.current_out += segments[-1].text - last_segment = self.format_segment( - self.timestamp_offset + segments[-1].start, - self.timestamp_offset + min(duration, segments[-1].end), - self.current_out - ) + # only process the last segment if it satisfies the no_speech_thresh + if segments[-1].no_speech_prob <= self.no_speech_thresh: + self.current_out += segments[-1].text + last_segment = self.format_segment( + self.timestamp_offset + segments[-1].start, + self.timestamp_offset + min(duration, segments[-1].end), + self.current_out + ) # if same incomplete segment is seen multiple times then update the offset # and append the segment to the list