PDF and OCR (Optical Character Recognition) - C#
PDF Extractor SDK sample in C# demonstrating ‘PDF and OCR (Optical Character Recognition)’
Program.cs
using System.Diagnostics;
using Bytescout.PDFExtractor;
// This example demonstrates the use of Optical Character Recognition (OCR) to extract text
// from scanned PDF documents and raster images.
// To make OCR work you should add the following references to your project:
// 'Bytescout.PDFExtractor.dll', 'Bytescout.PDFExtractor.OCRExtension.dll'.
namespace OCRExample
{
class Program
{
static void Main(string[] args)
{
// Create Bytescout.PDFExtractor.TextExtractor instance
TextExtractor extractor = new TextExtractor();
extractor.RegistrationName = "demo";
extractor.RegistrationKey = "demo";
// Load sample PDF document
extractor.LoadDocumentFromFile("sample_ocr.pdf");
// Enable Optical Character Recognition (OCR)
// in .Auto mode (SDK automatically checks if needs to use OCR or not)
extractor.OCRMode = OCRMode.Auto;
// Set the location of OCR language data files
extractor.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";
// Set OCR language
extractor.OCRLanguage = "eng"; // "eng" for english, "deu" for German, "fra" for French, "spa" for Spanish etc - according to files in "ocrdata" folder
// Find more language files at https://github.com/bytescout/ocrdata
// Set PDF document rendering resolution
extractor.OCRResolution = 300;
// You can also apply various preprocessing filters
// to improve the recognition on low-quality scans.
// Automatically deskew skewed scans
//extractor.OCRImagePreprocessingFilters.AddDeskew();
// Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
//extractor.OCRImagePreprocessingFilters.AddVerticalLinesRemover();
//extractor.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();
// Repair broken letters
//extractor.OCRImagePreprocessingFilters.AddDilate();
// Remove noise
//extractor.OCRImagePreprocessingFilters.AddMedian();
// Apply Gamma Correction
//extractor.OCRImagePreprocessingFilters.AddGammaCorrection();
// Add Contrast
//extractor.OCRImagePreprocessingFilters.AddContrast(20);
// (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing
// filters for your specific document.
// See "OCR Analyser" example.
// Save extracted text to file
extractor.SaveTextToFile("output.txt");
// Cleanup
extractor.Dispose();
// Open result document in default associated application (for demo purpose)
ProcessStartInfo processStartInfo = new ProcessStartInfo("output.txt");
processStartInfo.UseShellExecute = true;
Process.Start(processStartInfo);
}
}
}