OCR Modes - VB.NET
PDF Extractor SDK sample in VB.NET demonstrating ‘OCR Modes’
Program.vb
Imports Bytescout.PDFExtractor
' To make OCR work you should add the following references to your project:
' 'Bytescout.PDFExtractor.dll', 'Bytescout.PDFExtractor.OCRExtension.dll'.
Class Program

    ''' <summary>
    ''' Runs text extraction on the sample PDF once per OCR mode and prints
    ''' each result to the console under a header naming the mode used.
    ''' Waits for Enter before exiting so the output stays visible.
    ''' </summary>
    ''' <param name="args">Command-line arguments (not used).</param>
    Friend Shared Sub Main(args As String())

        ' Input document
        Dim inputDocument As String = ".\SampleWith_Vector_Image_Font.pdf"

        ' OCR modes to demonstrate, in output order:
        '   plain modes    - TextFromImagesOnly, TextFromVectorsOnly
        '   combined modes - TextFromImagesAndFonts, TextFromImagesAndVectorsAndFonts
        Dim modes As OCRMode() = {
            OCRMode.TextFromImagesOnly,
            OCRMode.TextFromVectorsOnly,
            OCRMode.TextFromImagesAndFonts,
            OCRMode.TextFromImagesAndVectorsAndFonts
        }

        Dim separator As String = "---------------------------------"

        For Each mode As OCRMode In modes
            ' The header text is taken from the enum member itself so the printed
            ' label always matches the mode that is actually passed to the extractor
            ' (a hand-typed label previously read "TextFromVectorOnly").
            Console.WriteLine(separator & Environment.NewLine &
                              "Extraction Mode: " & mode.ToString() & " " & Environment.NewLine &
                              separator)

            Console.WriteLine(_ExtractTextWithSpecificOCRMode(inputDocument, mode))
        Next

        ' Keep the console window open until the user presses Enter
        Console.ReadLine()

    End Sub

    ''' <summary>
    ''' Extracts text from a document using the specified OCR mode.
    ''' </summary>
    ''' <param name="inputDocument">Path of the PDF file to load.</param>
    ''' <param name="ocrMode">OCR mode assigned to the extractor before reading text.</param>
    ''' <returns>The text returned by the extractor's GetText() call.</returns>
    Friend Shared Function _ExtractTextWithSpecificOCRMode(inputDocument As String, ocrMode As OCRMode) As String

        ' Location of OCR language data files
        Dim ocrLanguageDataFolder As String = "c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\"

        ' OCR language: "eng" for English, "deu" for German, "fra" for French,
        ' "spa" for Spanish etc. - according to files in the "ocrdata" folder.
        ' Find more language files at https://github.com/bytescout/ocrdata/tree/master/ocrdata_best
        Dim ocrLanguage As String = "eng"

        ' Using guarantees the extractor is disposed even if extraction throws
        Using textExtractor As TextExtractor = New TextExtractor("demo", "demo")

            ' Load document to TextExtractor
            textExtractor.LoadDocumentFromFile(inputDocument)

            ' Specify OCR mode
            textExtractor.OCRMode = ocrMode

            ' OCR language data folder path and language
            textExtractor.OCRLanguageDataFolder = ocrLanguageDataFolder
            textExtractor.OCRLanguage = ocrLanguage

            ' Return extracted text
            Return textExtractor.GetText()

        End Using

    End Function

End Class