Parse with OCR - C#
Document Parser SDK sample in C# demonstrating ‘Parse with OCR’
DigitalOcean.yml
# Template metadata.
templateName: DigitalOcean Invoice
templateVersion: 4
templatePriority: 0

# Keywords listed under the template's detection rules.
detectionRules:
  keywords:
  - DigitalOcean
  - 101 Avenue of the Americas
  - Invoice Number

# Objects to extract: five single-value fields followed by one table.
objects:
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: DigitalOcean
    regex: true
- name: invoiceId
  objectType: field
  fieldProperties:
    fieldType: macros
    # The parenthesized group wraps the {{Digits}} macro that follows the label.
    expression: 'Invoice Number: ({{Digits}})'
    regex: true
- name: dateIssued
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Date Issued: ({{SmartDate}})'
    regex: true
    dataType: date
    dateFormat: auto-mdy
- name: total
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Total: ({{Money}})'
    regex: true
    dataType: decimal
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
- name: table1
  objectType: table
  tableProperties:
    # Expressions marking where the table starts and ends.
    start:
      expression: Description{{Spaces}}Hours
      regex: true
    end:
      expression: 'Total:'
      regex: true
    # Row expression with named groups: description, hours, start, end, unitPrice.
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<hours>{{Digits}}){{Spaces}}(?<start>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}(?<end>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}})'
      regex: true
    # Data types declared for the "hours" and "unitPrice" columns.
    columns:
    - name: hours
      dataType: integer
    - name: unitPrice
      dataType: decimal
Program.cs
using System;
using ByteScout.DocumentParser;
// This example demonstrates parsing of scanned documents
// using the Optical Character Recognition (OCR).
namespace GeneralExample
{
    /// <summary>
    /// Console sample that parses a scanned document with ByteScout Document Parser
    /// using Optical Character Recognition (OCR) and prints the result as JSON.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Configures OCR, loads the parsing template, parses the scanned input
        /// document and writes the resulting JSON to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string template = @".\DigitalOcean.yml";
            string inputDocument = @".\DigitalOcean-scanned.jpg";

            // Create and activate DocumentParser instance
            using (DocumentParser documentParser = new DocumentParser("demo", "demo"))
            {
                // Enable Optical Character Recognition (OCR) in Auto mode
                // (DocumentParser automatically detects if OCR is required).
                documentParser.OCRMode = OCRMode.Auto;

                // Set the resolution used for OCR
                // (described by the original sample as the PDF document rendering resolution).
                documentParser.OCRResolution = 300;

                // Set the location of OCR language data files
                documentParser.OCRLanguageDataFolder = @"c:\Program Files\ByteScout Document Parser SDK\ocrdata";

                // Set OCR language
                // "eng" for English, "deu" for German, "fra" for French, etc. - according to files in "ocrdata" folder
                // Find more language files at https://github.com/bytescout/ocrdata
                // Note: The OCRLanguage can be overridden in a template.
                // See the Template Creation Guide.
                documentParser.OCRLanguage = "eng";

                // You can also apply various preprocessing filters
                // to improve the recognition on low-quality scans.

                // Automatically deskew skewed scans
                //documentParser.OCRImagePreprocessingFilters.AddDeskew();

                // Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
                //documentParser.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
                //documentParser.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();

                // Repair broken letters
                //documentParser.OCRImagePreprocessingFilters.AddDilate();

                // Remove noise
                //documentParser.OCRImagePreprocessingFilters.AddMedian();

                // Apply Gamma Correction
                //documentParser.OCRImagePreprocessingFilters.AddGammaCorrection(1.4);

                // Add Contrast
                //documentParser.OCRImagePreprocessingFilters.AddContrast(20);

                // Load template
                documentParser.AddTemplate(template);
                Console.WriteLine("Template loaded.");
                Console.WriteLine();

                Console.WriteLine($"Parsing \"{inputDocument}\" with OCR...");
                Console.WriteLine();

                // Parse document data to JSON format
                string jsonString = documentParser.ParseDocument(inputDocument, OutputFormat.JSON);

                // Display parsed data in console
                Console.WriteLine("Parsed data in JSON format:");
                Console.WriteLine();
                Console.WriteLine(jsonString);
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");

            // The prompt promises "any key", so use ReadKey (ReadLine returns only on Enter).
            // ReadKey throws InvalidOperationException when standard input is redirected,
            // so keep the line-based wait in that case.
            if (Console.IsInputRedirected)
            {
                Console.ReadLine();
            }
            else
            {
                Console.ReadKey(true);
            }
        }
    }
}