From fac62aaccc15b661b090bcaeb6cc8c3ae3fde75d Mon Sep 17 00:00:00 2001
From: makaveli10 <vineet.suryan@collabora.com>
Date: Thu, 8 Aug 2024 06:05:12 -0400
Subject: [PATCH] Fix hallucinations with no_speech_thresh

Signed-off-by: makaveli10 <vineet.suryan@collabora.com>
---
 whisper_live/server.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/whisper_live/server.py b/whisper_live/server.py
index b6efc77..b668a6d 100644
--- a/whisper_live/server.py
+++ b/whisper_live/server.py
@@ -973,6 +973,7 @@ def speech_to_text(self):
 
             input_bytes, duration = self.get_audio_chunk_for_processing()
             if duration < 1.0:
+                time.sleep(0.1)     # wait for audio chunks to arrive
                 continue
             try:
                 input_sample = input_bytes.copy()
@@ -1046,12 +1047,14 @@ def update_segments(self, segments, duration):
                 self.transcript.append(self.format_segment(start, end, text_))
                 offset = min(duration, s.end)
 
-        self.current_out += segments[-1].text
-        last_segment = self.format_segment(
-            self.timestamp_offset + segments[-1].start,
-            self.timestamp_offset + min(duration, segments[-1].end),
-            self.current_out
-        )
+        # only use the last segment if its no_speech_prob does not exceed no_speech_thresh
+        if segments[-1].no_speech_prob <= self.no_speech_thresh:
+            self.current_out += segments[-1].text
+            last_segment = self.format_segment(
+                self.timestamp_offset + segments[-1].start,
+                self.timestamp_offset + min(duration, segments[-1].end),
+                self.current_out
+            )
 
         # if same incomplete segment is seen multiple times then update the offset
         # and append the segment to the list