Parse with OCR - VB.NET
Document Parser SDK sample in VB.NET demonstrating ‘Parse with OCR’
DigitalOcean.yml
templateName: DigitalOcean Invoice
templateVersion: 4
templatePriority: 0
detectionRules:
keywords:
- DigitalOcean
- 101 Avenue of the Americas
- Invoice Number
objects:
- name: companyName
objectType: field
fieldProperties:
fieldType: static
expression: DigitalOcean
regex: true
- name: invoiceId
objectType: field
fieldProperties:
fieldType: macros
expression: 'Invoice Number: ({{Digits}})'
regex: true
- name: dateIssued
objectType: field
fieldProperties:
fieldType: macros
expression: 'Date Issued: ({{SmartDate}})'
regex: true
dataType: date
dateFormat: auto-mdy
- name: total
objectType: field
fieldProperties:
fieldType: macros
expression: 'Total: ({{Money}})'
regex: true
dataType: decimal
- name: currency
objectType: field
fieldProperties:
fieldType: static
expression: USD
regex: true
- name: table1
objectType: table
tableProperties:
start:
expression: Description{{Spaces}}Hours
regex: true
end:
expression: 'Total:'
regex: true
row:
expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<hours>{{Digits}}){{Spaces}}(?<start>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}(?<end>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}})'
regex: true
columns:
- name: hours
dataType: integer
- name: unitPrice
dataType: decimal
Module1.vb
Imports ByteScout.DocumentParser
' This example demonstrates parsing of scanned documents
' using the Optical Character Recognition (OCR).
Module Module1
Sub Main()
Dim template As String = ".\DigitalOcean.yml"
Dim inputDocument As String = ".\DigitalOcean-scanned.jpg"
' Create and activate DocumentParser instance
Using documentParser As New DocumentParser("demo", "demo")
' Enable Optical Character Recognition (OCR) in Auto mode
' (DocumentParser automatically detects if OCR Is required).
documentParser.OCRMode = OCRMode.Auto
' Set PDF document rendering resolution
documentParser.OCRResolution = 300
' Set the location of OCR language data files
documentParser.OCRLanguageDataFolder = "c:\Program Files\ByteScout Document Parser SDK\ocrdata"
' Set OCR language
' "eng" for english, "deu" for German, "fra" for French, etc. - according to files in "ocrdata" folder
documentParser.OCRLanguage = "eng"
' Find more language files at https://github.com/bytescout/ocrdata
' Note: The OCRLanguage can be overridden in a template.
' See the Template Creation Guide.
' You can also apply various preprocessing filters
' to improve the recognition on low-quality scans.
' Automatically deskew skewed scans
'documentParser.OCRImagePreprocessingFilters.AddDeskew()
' Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
'documentParser.OCRImagePreprocessingFilters.AddVerticalLinesRemover()
'documentParser.OCRImagePreprocessingFilters.AddHorizontalLinesRemover()
' Repair broken letters
'documentParser.OCRImagePreprocessingFilters.AddDilate()
' Remove noise
'documentParser.OCRImagePreprocessingFilters.AddMedian()
' Apply Gamma Correction
'documentParser.OCRImagePreprocessingFilters.AddGammaCorrection(1.4)
' Add Contrast
'documentParser.OCRImagePreprocessingFilters.AddContrast(20)
' Load template
documentParser.AddTemplate(template)
Console.WriteLine("Template loaded.")
Console.WriteLine()
Console.WriteLine($"Parsing ""{inputDocument}"" with OCR...")
Console.WriteLine()
' Parse document data to JSON format
Dim jsonString As String = documentParser.ParseDocument(inputDocument, OutputFormat.JSON)
' Display parsed data in console
Console.WriteLine("Parsed data in JSON format:")
Console.WriteLine()
Console.WriteLine(jsonString)
End Using
Console.WriteLine()
Console.WriteLine("Press any key to continue...")
Console.ReadLine()
End Sub
End Module