Skip to content

Commit

Permalink
OCR: preserve whitespace, use worker
Browse files Browse the repository at this point in the history
Closes #7
  • Loading branch information
stscoundrel committed Nov 19, 2023
1 parent 5a49372 commit b4fd591
Showing 1 changed file with 13 additions and 11 deletions.
24 changes: 13 additions & 11 deletions image-to-text/src/app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@ import { chunkArray } from "skilja";
const inputPath = "./resources/images";
const outputPath = "./resources/text";

/**
 * Creates a Tesseract worker for the "dan" (Danish) language and configures
 * it to keep inter-word spacing in the recognized text.
 *
 * @returns A worker whose parameters have been applied before it is handed out.
 * @throws Rejects if worker creation or parameter configuration fails.
 */
const getWorker = async (): Promise<Tesseract.Worker> => {
  const worker = await Tesseract.createWorker("dan");

  // setParameters is asynchronous: await it so the setting is in effect before
  // the caller runs recognize(), and so a failure rejects this function
  // instead of surfacing as an unhandled promise rejection.
  await worker.setParameters({
    preserve_interword_spaces: "1",
  });

  // NOTE(review): the caller owns the returned worker; presumably it should
  // call worker.terminate() once recognition is done — confirm at call sites.
  return worker;
};

async function processImages() {
try {
const allFiles = await fs.promises.readdir(inputPath);
Expand All @@ -14,20 +24,12 @@ async function processImages() {
const imageProcessingPromises = batches[i].map(async (file) => {
const imagePath = `${inputPath}/${file}`;

const worker = await getWorker();

// Perform OCR on the current image
const {
data: { text },
} = await Tesseract.recognize(
imagePath,
"dan", // Danish language.
// Typing wont work for these options.
{
// @ts-ignore
// TODO: does not seem to handle two columns.
// Can settings help with this, or should it be parsed separately later?
tessedit_pageseg_mode: Tesseract.PSM.AUTO_OSD, // Page segmentation mode
},
);
} = await worker.recognize(imagePath);

// Output the extracted text to a file
const outputFilePath = `${outputPath}/${file}.txt`.replace(".gif", "");
Expand Down

0 comments on commit b4fd591

Please sign in to comment.