-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaudiobook_with_users.py
163 lines (138 loc) · 5.43 KB
/
audiobook_with_users.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
from pathlib import Path
from openai import OpenAI
import subprocess
import re
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
openai = OpenAI(api_key="")
def read_book(file_path):
logging.info(f"Reading book from {file_path}")
with open(file_path, 'r', encoding='utf-8') as file:
return file.read()
messages = [
{
"role": "system",
"content": """
Help me guess speakers for different quotes in a paragraph.
Guess the gender of the speaker if you can. If unknown, say UNKNOWN.
Give output for each quote in the format: speaker,gender
"""
}
]
def identify_speaker(context, quote):
global messages
# Add the new user prompt as a message
user_message = {
"role": "user",
"content": f"Given the context: [{context}], who is most likely to say: [{quote}]?"
}
messages.append(user_message)
response = openai.chat.completions.create(
model="gpt-3.5-turbo",
messages=messages,
max_tokens=60
)
print(response)
# Update messages with the response
assistant_message = response.choices[0].message
response_message = {
"role": "assistant",
"content": assistant_message.content # Accessing content directly
}
messages.append(response_message)
# Process response
content = assistant_message.content
if content is None:
return ""
resp = content.strip()
try:
speaker = resp[0].strip()
gender = resp[1].strip() if len(resp) > 1 else "male" # Default to male if gender is unknown
except IndexError:
speaker = "UNKNOWN"
gender = "male" # Default to male if parsing fails
return speaker, gender
def assign_voice():
male_voices = ["Echo", "Onyx", "Fable"]
female_voices = ["Shimer", "Nova", "Alloy"]
voice_map = {}
male_index = [0] # Use a list to maintain state
female_index = [0]
def voice_picker(speaker, gender: str):
if speaker in voice_map:
return voice_map[speaker]
else:
if gender.lower() == 'male':
voice = male_voices[male_index[0] % len(male_voices)]
male_index[0] += 1
else:
voice = female_voices[female_index[0] % len(female_voices)]
female_index[0] += 1
voice_map[speaker] = voice
return voice
return voice_picker
def chunk_text(input_text):
chunks = []
paragraphs = input_text.split('\n\n')
logging.info(f"Split book into {len(paragraphs)} paragraphs")
for paragraph in paragraphs:
# Check if the paragraph is in square brackets
if paragraph.startswith('[') and paragraph.endswith(']'):
logging.info(f"Found system message of length: {len(paragraph)}")
# Remove the square brackets and treat as a system message
content = paragraph[1:-1]
chunks.append((('system', 'male'), content))
else:
# Find quotes within the paragraph
quotes = re.findall(r'"([^"]*)"', paragraph)
logging.info(f"Found {len(quotes)} quotes in paragraph of length: {len(paragraph)}")
start = 0
for quote in quotes:
end = paragraph.find(quote, start)
# Add non-quote text as narration
if start < end:
chunks.append((('narrator', 'male'), paragraph[start:end]))
# Add quote text with identified speaker
speaker_gender = identify_speaker(paragraph, quote)
chunks.append((speaker_gender, quote))
start = end + len(quote)
# Add remaining paragraph as narration
if start < len(paragraph):
chunks.append((('narrator', 'male'), paragraph[start:]))
return chunks
def generate_audio(chunk, voice, index, temp_folder):
speech_file_path = temp_folder / f"chunk_{index}.mp3"
if not speech_file_path.exists():
response = openai.audio.speech.create(
model="tts-1-hd",
voice=voice,
input=chunk
)
response.stream_to_file(speech_file_path)
return speech_file_path
def combine_audio(files, output_path):
command = ["ffmpeg", "-y", "-i", "concat:" + "|".join(files), "-acodec", "copy", output_path]
subprocess.run(command, check=True)
def main(book_file_path):
logging.info(f"Starting processing of book: {book_file_path}")
book_text = read_book(book_file_path)
temp_folder = Path(__file__).parent / "temp_audio_files"
temp_folder.mkdir(exist_ok=True)
logging.info(f"Temporary folder created at {temp_folder}")
voice_picker = assign_voice()
chunks = chunk_text(book_text)
audio_files = []
for index, (speaker_gender, chunk) in enumerate(chunks):
speaker, gender = speaker_gender
voice = voice_picker(speaker, gender)
logging.info(f"Generating audio for chunk {index} with speaker {speaker} and voice {voice}")
audio_path = generate_audio(chunk, voice, index, temp_folder)
audio_files.append(str(audio_path))
combined_audio_path = Path(__file__).parent / "combined_speech.mp3"
combine_audio(audio_files, str(combined_audio_path))
logging.info(f"Audio files combined into {combined_audio_path}")
# Example usage
book_file_path = Path(__file__).parent / "book.txt"
main(book_file_path)