PDF Invoice Parsing - VB.NET
PDF Extractor SDK sample in VB.NET demonstrating ‘PDF Invoice Parsing’
Module1.vb
Imports System.Drawing
Imports Bytescout.PDFExtractor
Module Module1

    ''' <summary>
    ''' Parses "Invoice.pdf" and prints the invoice number, the invoice date, the total
    ''' and the tabular data (as XML) to the console.
    ''' </summary>
    ''' <remarks>
    ''' Each scalar field is located by searching a page for its label and reading the text
    ''' between the label and the right edge of the page. The table area spans the full page
    ''' width, from the top of the "Quantity" label down to the top of the "TOTAL" label.
    ''' When a label occurs on several pages, the value from the last such page wins.
    ''' </remarks>
    Sub Main()

        ' Results
        Dim invoiceNo = String.Empty
        Dim invoiceDate = String.Empty
        Dim total = String.Empty
        Dim tableData = String.Empty

        ' Using guarantees that both extractors are disposed even if loading or parsing throws.
        Using textExtractor As New TextExtractor("demo", "demo"), xmlExtractor As New XMLExtractor("demo", "demo")

            ' Set exact search (default is SmartSearch that works like in Adobe Reader)
            textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch

            ' Load document
            textExtractor.LoadDocumentFromFile("Invoice.pdf")
            xmlExtractor.LoadDocumentFromFile("Invoice.pdf")

            ' Iterate pages
            For i As Integer = 0 To textExtractor.GetPageCount() - 1

                Dim pageRectangle = textExtractor.GetPageRectangle(i)
                ' Full page width; Y and Height are filled in from the labels found below.
                Dim tableRect = New RectangleF(0, 0, pageRectangle.Width, 0)

                ' Search for "Invoice No." and assume the text at right is the invoice number
                If textExtractor.Find(i, "Invoice No.", False) Then
                    invoiceNo = ExtractTextRightOfFoundText(textExtractor, i, pageRectangle.Right)
                End If

                ' Search for "Invoice Date" and extract text at right
                If textExtractor.Find(i, "Invoice Date", False) Then
                    invoiceDate = ExtractTextRightOfFoundText(textExtractor, i, pageRectangle.Right)
                End If

                ' Search for "Quantity" keyword to detect the top of the tabular data rectangle
                If textExtractor.Find(i, "Quantity", False) Then
                    ' Keep the top table coordinate
                    ' (use FoundText.Bounds.Bottom instead if you want to skip column headers)
                    tableRect.Y = textExtractor.FoundText.Bounds.Top
                End If

                ' Search and extract "TOTAL" (it will be also the bottom of tabular data rectangle)
                If textExtractor.Find(i, "TOTAL", True) Then ' case sensitive!
                    ' Remember the label's top edge before the extraction call below.
                    Dim totalTop = textExtractor.FoundText.Bounds.Top
                    total = ExtractTextRightOfFoundText(textExtractor, i, pageRectangle.Right)
                    ' Calculate the table height
                    tableRect.Height = totalTop - tableRect.Top
                End If

                ' Extract tabular data using XMLExtractor (only when a table bottom was found
                ' below the table top on this page)
                If tableRect.Height > 0 Then
                    xmlExtractor.SetExtractionArea(tableRect)
                    tableData = xmlExtractor.GetXMLFromPage(i)
                End If

            Next

        End Using

        ' Display extracted data
        Console.WriteLine("Invoice No.: " & invoiceNo)
        Console.WriteLine("Invoice Date: " & invoiceDate)
        Console.WriteLine("TOTAL: " & total)
        Console.WriteLine("Table Data: ")
        Console.WriteLine(tableData)

        Console.WriteLine()
        Console.WriteLine("Press any key...")
        Console.ReadKey()

    End Sub

    ''' <summary>
    ''' Extracts the trimmed text located between the right edge of the extractor's current
    ''' FoundText and the given right page boundary.
    ''' </summary>
    ''' <param name="extractor">Extractor on which a Find call has just succeeded.</param>
    ''' <param name="pageIndex">Zero-based index of the page to read from.</param>
    ''' <param name="pageRight">X coordinate of the right edge of the page.</param>
    ''' <returns>The trimmed text found in that area.</returns>
    Private Function ExtractTextRightOfFoundText(extractor As TextExtractor, pageIndex As Integer, pageRight As Single) As String
        Dim area = extractor.FoundText.Bounds
        ' Shift the rectangle so it starts where the label ends and runs to the page edge.
        area.X = area.Right
        area.Width = pageRight - area.Left
        ' NOTE(review): the extraction area stays set on the extractor after this call;
        ' confirm against the SDK documentation whether later Find calls are affected by it.
        extractor.SetExtractionArea(area)
        Return extractor.GetTextFromPage(pageIndex).Trim()
    End Function

End Module
Resources.Designer.vb
'------------------------------------------------------------------------------
' <auto-generated>
' This code was generated by a tool.
' Runtime Version:4.0.30319.42000
'
' Changes to this file may cause incorrect behavior and will be lost if
' the code is regenerated.
' </auto-generated>
'------------------------------------------------------------------------------
Option Strict On
Option Explicit On
Namespace My.Resources

    'Strongly-typed accessor module for the "InvoiceParsing.Resources" resource set.
    'The original of this file is produced by StronglyTypedResourceBuilder (ResGen /
    'Visual Studio); regenerate it from the .ResX file rather than adding members by hand.
    '''<summary>
    '''  Strongly-typed entry point for looking up localized resources.
    '''</summary>
    <Global.System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0"), _
     Global.System.Diagnostics.DebuggerNonUserCodeAttribute(), _
     Global.System.Runtime.CompilerServices.CompilerGeneratedAttribute(), _
     Global.Microsoft.VisualBasic.HideModuleNameAttribute()> _
    Friend Module Resources

        'Lazily created manager; Nothing until ResourceManager is first read.
        Private cachedManager As Global.System.Resources.ResourceManager

        'Culture stored by the Culture property; Nothing until a caller assigns one.
        Private cultureOverride As Global.System.Globalization.CultureInfo

        '''<summary>
        '''  Gets the ResourceManager for this module, creating and caching it on first use.
        '''</summary>
        <Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
        Friend ReadOnly Property ResourceManager() As Global.System.Resources.ResourceManager
            Get
                If cachedManager Is Nothing Then
                    cachedManager = New Global.System.Resources.ResourceManager("InvoiceParsing.Resources", GetType(Resources).Assembly)
                End If
                Return cachedManager
            End Get
        End Property

        '''<summary>
        '''  Gets or sets the culture kept by this module for resource lookups.
        '''</summary>
        <Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
        Friend Property Culture() As Global.System.Globalization.CultureInfo
            Get
                Return cultureOverride
            End Get
            Set(ByVal value As Global.System.Globalization.CultureInfo)
                cultureOverride = value
            End Set
        End Property

    End Module

End Namespace
Settings.Designer.vb
'------------------------------------------------------------------------------
' <auto-generated>
' This code was generated by a tool.
' Runtime Version:4.0.30319.42000
'
' Changes to this file may cause incorrect behavior and will be lost if
' the code is regenerated.
' </auto-generated>
'------------------------------------------------------------------------------
Option Strict On
Option Explicit On
Namespace My
'''<summary>
''' Application settings class derived from ApplicationSettingsBase.
''' A single shared instance is exposed through the Default property.
'''</summary>
<Global.System.Runtime.CompilerServices.CompilerGeneratedAttribute(), _
Global.System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "11.0.0.0"), _
Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
Partial Friend NotInheritable Class MySettings
Inherits Global.System.Configuration.ApplicationSettingsBase
' The one shared instance, obtained by passing a new MySettings through
' ApplicationSettingsBase.Synchronized and casting the result back to MySettings.
Private Shared defaultInstance As MySettings = CType(Global.System.Configuration.ApplicationSettingsBase.Synchronized(New MySettings), MySettings)
#Region "My.Settings Auto-Save Functionality"
' Everything in this region is compiled only when _MyType = "WindowsForms".
#If _MyType = "WindowsForms" Then
' True once AutoSaveSettings has been attached to My.Application.Shutdown.
Private Shared addedHandler As Boolean
' Lock object guarding the one-time handler registration in the Default property.
Private Shared addedHandlerLockObject As New Object
'''<summary>
''' Event handler that saves My.Settings when My.Application.SaveMySettingsOnExit is True.
'''</summary>
<Global.System.Diagnostics.DebuggerNonUserCodeAttribute(), Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
Private Shared Sub AutoSaveSettings(ByVal sender As Global.System.Object, ByVal e As Global.System.EventArgs)
If My.Application.SaveMySettingsOnExit Then
My.Settings.Save()
End If
End Sub
#End If
#End Region
'''<summary>
''' Returns the shared MySettings instance. In WindowsForms builds, the first access also
''' attaches AutoSaveSettings to My.Application.Shutdown.
'''</summary>
Public Shared ReadOnly Property [Default]() As MySettings
Get
#If _MyType = "WindowsForms" Then
' The flag is tested once without the lock and again inside SyncLock, so the
' Shutdown handler is added at most once.
If Not addedHandler Then
SyncLock addedHandlerLockObject
If Not addedHandler Then
AddHandler My.Application.Shutdown, AddressOf AutoSaveSettings
addedHandler = True
End If
End SyncLock
End If
#End If
Return defaultInstance
End Get
End Property
End Class
End Namespace
Namespace My

    'Module that exposes the shared MySettings instance through a Settings property.
    <Global.System.Runtime.CompilerServices.CompilerGeneratedAttribute(), _
     Global.System.Diagnostics.DebuggerNonUserCodeAttribute(), _
     Global.Microsoft.VisualBasic.HideModuleNameAttribute()> _
    Friend Module MySettingsProperty

        '''<summary>
        '''  Gets the shared settings instance returned by MySettings.Default.
        '''</summary>
        <Global.System.ComponentModel.Design.HelpKeywordAttribute("My.Settings")> _
        Friend ReadOnly Property Settings() As Global.InvoiceParsing.My.MySettings
            Get
                Dim sharedSettings As Global.InvoiceParsing.My.MySettings = Global.InvoiceParsing.My.MySettings.Default
                Return sharedSettings
            End Get
        End Property

    End Module

End Namespace