-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_corpus.py
53 lines (42 loc) · 1.89 KB
/
create_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import random
import click
from tqdm import tqdm
def create_sentences(directory, step_size, *, sentence_length=9,
                     train_ratio=0.9, encoding="utf-8"):
    """Build a shuffled train/test corpus of fixed-length word windows.

    Every ``.txt`` file directly inside ``directory`` is read, split on
    whitespace, and cut into overlapping windows of ``sentence_length``
    words, advancing ``step_size`` words between windows. All windows are
    shuffled together, then the first ``train_ratio`` fraction is written to
    ``train.txt`` and the remainder to ``test.txt``, one window per line.

    NOTE: both output files are written to the current working directory.
    If ``directory`` is also the working directory, a later run will read
    the previously written ``train.txt``/``test.txt`` as input files.

    Args:
        directory: Path of the directory holding the source ``.txt`` files.
        step_size: Number of words to advance between consecutive windows.
            Must be at least 1.
        sentence_length: Number of words per generated sentence.
        train_ratio: Fraction (0.0 to 1.0) of sentences written to
            ``train.txt``; the rest go to ``test.txt``.
        encoding: Text encoding used to read the inputs and write the
            outputs. Fixed explicitly so results do not depend on the
            platform's locale default.

    Raises:
        ValueError: If ``step_size`` or ``sentence_length`` is below 1, or
            ``train_ratio`` is outside ``[0, 1]``.
    """
    # range() rejects a zero step with an opaque message and silently yields
    # nothing for a negative one, so validate up front.
    if step_size < 1:
        raise ValueError(f"step_size must be at least 1, got {step_size!r}")
    if sentence_length < 1:
        raise ValueError(
            f"sentence_length must be at least 1, got {sentence_length!r}")
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(
            f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    sentences = []

    # Iterate over the text files in the directory
    for filename in tqdm(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(directory, filename)
        # A sub-directory may also be named "*.txt"; open() would fail on it.
        if not os.path.isfile(file_path):
            continue

        # Read the contents of the file
        with open(file_path, 'r', encoding=encoding) as file:
            words = file.read().split()

        # Only start positions with a full window of words behind them are
        # used; files shorter than sentence_length contribute nothing.
        last_start = len(words) - sentence_length
        for start in range(0, last_start + 1, step_size):
            sentences.append(' '.join(words[start:start + sentence_length]))

    # Shuffle the sentences randomly
    random.shuffle(sentences)

    # Split the sentences into train and test sets
    train_size = int(train_ratio * len(sentences))
    train_sentences = sentences[:train_size]
    test_sentences = sentences[train_size:]

    # Write the train sentences to train.txt
    with open('train.txt', 'w', encoding=encoding) as train_file:
        train_file.write('\n'.join(train_sentences))

    # Write the test sentences to test.txt
    with open('test.txt', 'w', encoding=encoding) as test_file:
        test_file.write('\n'.join(test_sentences))

    print(f"Created {len(train_sentences)} sentences in train.txt")
    print(f"Created {len(test_sentences)} sentences in test.txt")
# Command-line entry point. --directory is mandatory: without it click would
# pass None, which fails later inside os.path.join with a TypeError.
# --step-size is limited to positive integers because it is used as the step
# of a range().
@click.command()
@click.option('--directory',
              required=True,
              type=click.Path(exists=True, file_okay=False, dir_okay=True),
              help='Directory containing the text files')
@click.option('--step-size', default=3, show_default=True,
              type=click.IntRange(min=1),
              help='Step size for generating sentences')
def main(directory, step_size):
    """Create sentences from text files in the specified directory with given step size."""
    create_sentences(directory, step_size)


if __name__ == '__main__':
    main()