Split PDF Document By Text - C#
PDF Extractor SDK sample in C# demonstrating ‘Split PDF Document By Text’
Program.cs
using System;
using System.IO;
using Bytescout.PDFExtractor;
namespace SplittingExample
{
    /// <summary>
    /// Console sample that splits a PDF document by a search text using the
    /// Bytescout PDF Extractor SDK and lists the files that were produced.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Configures a <c>DocumentSplitter2</c> (OCR mode, language, image
        /// preprocessing filters), splits the input PDF on the search string,
        /// and prints the names of the resulting files to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Input file
            string inputFile = "sample.pdf";

            // Output folder
            string outputFolder = "./output";

            // Regex-enabled alternative - search for multiple strings:
            //String[] pattern_list = new string[] {"Invoice", "1nvoice"};
            //String searchString = String.Format("({0})", String.Join("|", pattern_list));
            //bool isRegexEnabled = true;

            // Search for a single string
            string searchString = "Invoice";
            bool isRegexEnabled = false;

            // Registration name/key
            string registrationName = "demo";
            string registrationKey = "demo";

            // Make sure the output folder exists before splitting, so the sample
            // does not depend on the SDK creating it. This is a no-op when the
            // folder is already present.
            Directory.CreateDirectory(outputFolder);

            using (var splitter = new DocumentSplitter2(registrationName, registrationKey))
            {
                // Enable Optical Character Recognition (OCR)
                // in .Auto mode (SDK automatically checks if it needs to use OCR or not)
                splitter.OCRMode = OCRMode.Auto;

                // Set the location of OCR language data files
                splitter.OCRLanguageDataFolder = @"c:\Program Files\Bytescout PDF Extractor SDK\ocrdata_best\";

                // Set OCR language:
                // "eng" for English, "deu" for German, "fra" for French, "spa" for Spanish etc. -
                // according to the files in the "ocrdata" folder.
                // Find more language files at https://github.com/bytescout/ocrdata/tree/master/ocrdata
                splitter.OCRLanguage = "eng";

                // Repair broken letters
                splitter.OCRImagePreprocessingFilters.AddDilate();

                // Remove noise
                splitter.OCRImagePreprocessingFilters.AddMedian();

                // Auto-detect page rotation
                splitter.OCRDetectPageRotation = true;

                // Add filters that remove horizontal/vertical lines
                splitter.OCRImagePreprocessingFilters.AddHorizontalLinesRemover();
                splitter.OCRImagePreprocessingFilters.AddVerticalLinesRemover();

                // Perform the split by text
                var files = splitter.Split(inputFile, searchString,
                    caseSensitive: false,
                    useRegex: isRegexEnabled,
                    excludePageWithFoundText: false,
                    outputFolder: outputFolder);

                // Display the names of the generated files
                Console.WriteLine(@"Splitted by Text: ");
                foreach (string file in files)
                {
                    Console.WriteLine(" " + Path.GetFileName(file));
                }
            }

            // Keep the console window open until the user presses a key
            Console.WriteLine();
            Console.WriteLine("Press any key...");
            Console.ReadKey();
        }
    }
}