Find Text in PDF using Regex - C#
PDF Extractor SDK sample in C# demonstrating ‘Find Text in PDF using Regex’
Program.cs
using System;
using Bytescout.PDFExtractor;
namespace FindText
{
// Note: if you are looking for a higher-level API to extract data from invoices, reports, or statements,
// please check Document Parser SDK and Web API at https://bytescout.com/products/developer/documentparsersdk/index.html
// and https://pdf.co for a secure and scalable web API.
class Program
{
    /// <summary>
    /// Loads <c>.\Invoice.pdf</c>, searches every page for date-like strings
    /// (two digits / two digits / four digits) using the extractor's regex mode,
    /// and prints each match together with the font details of its elements.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Search dates in format 12/31/1999.
        // Declared once up front: the pattern does not change from page to page.
        // See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
        string regexPattern = "[0-9]{2}/[0-9]{2}/[0-9]{4}";

        // Create Bytescout.PDFExtractor.TextExtractor instance
        TextExtractor extractor = new TextExtractor();
        try
        {
            extractor.RegistrationName = "demo";
            extractor.RegistrationKey = "demo";

            // Load sample PDF document
            extractor.LoadDocumentFromFile(@".\Invoice.pdf");

            // Enable the regular expressions so Find() treats the search string as a pattern
            extractor.RegexSearch = true;

            int pageCount = extractor.GetPageCount();

            // Search through pages (the page index passed to Find() starts at 0)
            for (int i = 0; i < pageCount; i++)
            {
                // Search each page for the pattern; skip pages with no match
                if (!extractor.Find(i, regexPattern, false))
                {
                    continue;
                }

                do
                {
                    Console.WriteLine("");
                    Console.WriteLine("Found on page " + i + " at location " + extractor.FoundText.Bounds);
                    Console.WriteLine("");

                    // Iterate through each element in the found text
                    foreach (ISearchResultElement element in extractor.FoundText.Elements)
                    {
                        Console.WriteLine(" Text: " + element.Text);
                        Console.WriteLine(" Font is bold: " + element.FontIsBold);
                        Console.WriteLine(" Font is italic: " + element.FontIsItalic);
                        Console.WriteLine(" Font name: " + element.FontName);
                        Console.WriteLine(" Font size: " + element.FontSize);
                        Console.WriteLine(" Font color: " + element.FontColor);
                        Console.WriteLine();
                    }
                }
                while (extractor.FindNext());
            }
        }
        finally
        {
            // Cleanup: runs even when loading or searching throws,
            // so the extractor is always released.
            extractor.Dispose();
        }

        Console.WriteLine();
        Console.WriteLine("Press any key to continue...");
        Console.ReadLine();
    }
}
}