Link Search Menu Expand Document

PDF Invoice Parsing - VB.NET

This PDF Extractor SDK sample in VB.NET demonstrates ‘PDF Invoice Parsing’: it reads the invoice number, invoice date, total, and the line-item table from Invoice.pdf.

Module1.vb
Imports System.Drawing
Imports Bytescout.PDFExtractor

''' <summary>
''' Console sample that reads the invoice number, invoice date, total and the
''' tabular line-item data from "Invoice.pdf" using TextExtractor and XMLExtractor.
''' </summary>
Module Module1

    ''' <summary>
    ''' Entry point. Scans every page of "Invoice.pdf" for the "Invoice No.",
    ''' "Invoice Date", "Quantity" and "TOTAL" keywords, extracts the text found to
    ''' the right of the labels plus the table between "Quantity" and "TOTAL",
    ''' prints the results and waits for a key press.
    ''' </summary>
    Sub Main()

        ' Results (values found on a later page replace those from an earlier page)
        Dim invoiceNo = String.Empty
        Dim invoiceDate = String.Empty
        Dim total = String.Empty
        Dim tableData = String.Empty

        ' Using guarantees that both extractors are disposed even when loading or
        ' parsing throws (for example when Invoice.pdf is missing).
        Using textExtractor As New TextExtractor("demo", "demo"), _
              xmlExtractor As New XMLExtractor("demo", "demo")

            ' Set exact search (default is SmartSearch that works like in Adobe Reader)
            textExtractor.WordMatchingMode = WordMatchingMode.ExactMatch

            ' Load document
            textExtractor.LoadDocumentFromFile("Invoice.pdf")
            xmlExtractor.LoadDocumentFromFile("Invoice.pdf")

            ' Iterate pages
            For i As Integer = 0 To textExtractor.GetPageCount() - 1

                Dim pageRectangle = textExtractor.GetPageRectangle(i)
                ' Full page width, zero height until both table edges are located on this page
                Dim tableRect = New RectangleF(0, 0, pageRectangle.Width, 0)

                ' Search for "Invoice No." and assume the text at right is the invoice number
                If textExtractor.Find(i, "Invoice No.", False) Then
                    invoiceNo = ExtractTextRightOfMatch(textExtractor, i, pageRectangle.Right)
                End If

                ' Search for "Invoice Date" and extract text at right
                If textExtractor.Find(i, "Invoice Date", False) Then
                    invoiceDate = ExtractTextRightOfMatch(textExtractor, i, pageRectangle.Right)
                End If

                ' Search for "Quantity" keyword to detect the top of the tabular data rectangle
                If textExtractor.Find(i, "Quantity", False) Then
                    ' Keep the top table coordinate
                    ' (use FoundText.Bounds.Bottom instead if you want to skip column headers)
                    tableRect.Y = textExtractor.FoundText.Bounds.Top
                End If

                ' Search and extract "TOTAL" (it is also the bottom of the tabular data rectangle)
                If textExtractor.Find(i, "TOTAL", True) Then ' case sensitive!
                    ' Remember the top of the match before the extraction call below
                    Dim totalTop = textExtractor.FoundText.Bounds.Top
                    total = ExtractTextRightOfMatch(textExtractor, i, pageRectangle.Right)

                    ' Calculate the table height
                    tableRect.Height = totalTop - tableRect.Top
                End If

                ' Extract tabular data using XMLExtractor (only when a non-empty region was found)
                If tableRect.Height > 0 Then
                    xmlExtractor.SetExtractionArea(tableRect)
                    tableData = xmlExtractor.GetXMLFromPage(i)
                End If
            Next

        End Using

        ' Display extracted data
        Console.WriteLine("Invoice No.: " & invoiceNo)
        Console.WriteLine("Invoice Date: " & invoiceDate)
        Console.WriteLine("TOTAL: " & total)
        Console.WriteLine("Table Data: ")
        Console.WriteLine(tableData)

        Console.WriteLine()
        Console.WriteLine("Press any key...")
        Console.ReadKey()

    End Sub

    ''' <summary>
    ''' Extracts the trimmed text located to the right of the extractor's current
    ''' search match, up to the right edge of the page.
    ''' </summary>
    ''' <param name="extractor">Extractor whose last Find call succeeded.</param>
    ''' <param name="pageIndex">Zero-based index of the page to read from.</param>
    ''' <param name="pageRight">Right coordinate of the page rectangle.</param>
    ''' <returns>The text of the region right of the match, trimmed.</returns>
    Private Function ExtractTextRightOfMatch(ByVal extractor As TextExtractor, ByVal pageIndex As Integer, ByVal pageRight As Single) As String
        ' Start from the bounds of the found text and shift the rectangle to the right of it
        Dim area = extractor.FoundText.Bounds
        area.X = area.Right
        area.Width = pageRight - area.Left
        ' Set the extraction region and extract the text
        extractor.SetExtractionArea(area)
        Return extractor.GetTextFromPage(pageIndex).Trim()
    End Function

End Module

Resources.Designer.vb
'------------------------------------------------------------------------------
' <auto-generated>
'     This code was generated by a tool.
'     Runtime Version:4.0.30319.42000
'
'     Changes to this file may cause incorrect behavior and will be lost if
'     the code is regenerated.
' </auto-generated>
'------------------------------------------------------------------------------

Option Strict On
Option Explicit On


Namespace My.Resources

    'Auto-generated by the StronglyTypedResourceBuilder class (ResGen or Visual Studio).
    'Add or remove members by editing the .ResX file and then rerunning ResGen
    'with the /str option, or by rebuilding the VS project.
    '''<summary>
    '''  Strongly-typed resource module used to look up localized strings, etc.
    '''</summary>
    <Global.System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0"), _
     Global.System.Diagnostics.DebuggerNonUserCodeAttribute(), _
     Global.System.Runtime.CompilerServices.CompilerGeneratedAttribute(), _
     Global.Microsoft.VisualBasic.HideModuleNameAttribute()> _
    Friend Module Resources

        'Culture stored through the Culture property; Nothing until a caller assigns one.
        Private resourceCulture As Global.System.Globalization.CultureInfo

        'Manager instance created on the first read of ResourceManager and reused afterwards.
        Private resourceMan As Global.System.Resources.ResourceManager

        '''<summary>
        '''  Gets the cached ResourceManager for "InvoiceParsing.Resources",
        '''  creating it on first access.
        '''</summary>
        <Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
        Friend ReadOnly Property ResourceManager() As Global.System.Resources.ResourceManager
            Get
                'Fast path: the manager already exists.
                If resourceMan IsNot Nothing Then
                    Return resourceMan
                End If

                resourceMan = New Global.System.Resources.ResourceManager("InvoiceParsing.Resources", GetType(Resources).Assembly)
                Return resourceMan
            End Get
        End Property

        '''<summary>
        '''  Gets or sets the culture kept by this module for resource lookups
        '''  made through this strongly typed resource class.
        '''</summary>
        <Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
        Friend Property Culture() As Global.System.Globalization.CultureInfo
            Get
                Return resourceCulture
            End Get
            Set(ByVal newCulture As Global.System.Globalization.CultureInfo)
                resourceCulture = newCulture
            End Set
        End Property
    End Module
End Namespace

Settings.Designer.vb
'------------------------------------------------------------------------------
' <auto-generated>
'     This code was generated by a tool.
'     Runtime Version:4.0.30319.42000
'
'     Changes to this file may cause incorrect behavior and will be lost if
'     the code is regenerated.
' </auto-generated>
'------------------------------------------------------------------------------

Option Strict On
Option Explicit On


Namespace My

    '''<summary>
    '''  Auto-generated application settings class. Exposes one shared instance
    '''  through the Default property.
    '''</summary>
    <Global.System.Runtime.CompilerServices.CompilerGeneratedAttribute(), _
     Global.System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "11.0.0.0"), _
     Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
    Partial Friend NotInheritable Class MySettings
        Inherits Global.System.Configuration.ApplicationSettingsBase

        'Shared instance returned by Default; initialized with the object returned by
        'ApplicationSettingsBase.Synchronized for a new MySettings.
        Private Shared defaultInstance As MySettings = CType(Global.System.Configuration.ApplicationSettingsBase.Synchronized(New MySettings), MySettings)

#Region "My.Settings Auto-Save Functionality"
        'Everything in this region is compiled only when _MyType = "WindowsForms".
#If _MyType = "WindowsForms" Then
        'True once AutoSaveSettings has been attached to My.Application.Shutdown.
        Private Shared addedHandler As Boolean

        'Lock object guarding the one-time handler registration in Default.
        Private Shared addedHandlerLockObject As New Object

        'Shutdown handler: saves My.Settings when My.Application.SaveMySettingsOnExit is set.
        <Global.System.Diagnostics.DebuggerNonUserCodeAttribute(), Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
        Private Shared Sub AutoSaveSettings(ByVal sender As Global.System.Object, ByVal e As Global.System.EventArgs)
            If My.Application.SaveMySettingsOnExit Then
                My.Settings.Save()
            End If
        End Sub
#End If
#End Region

        '''<summary>
        '''  Returns the shared MySettings instance. In WindowsForms builds the first
        '''  call also attaches AutoSaveSettings to My.Application.Shutdown.
        '''</summary>
        Public Shared ReadOnly Property [Default]() As MySettings
            Get

#If _MyType = "WindowsForms" Then
                   'Check the flag, take the lock, then check again so the handler is added only once.
                   If Not addedHandler Then
                        SyncLock addedHandlerLockObject
                            If Not addedHandler Then
                                AddHandler My.Application.Shutdown, AddressOf AutoSaveSettings
                                addedHandler = True
                            End If
                        End SyncLock
                    End If
#End If
                Return defaultInstance
            End Get
        End Property
    End Class
End Namespace

Namespace My

    'Auto-generated module that surfaces the shared settings instance as My.Settings.
    <Global.System.Runtime.CompilerServices.CompilerGeneratedAttribute(), _
     Global.System.Diagnostics.DebuggerNonUserCodeAttribute(), _
     Global.Microsoft.VisualBasic.HideModuleNameAttribute()> _
    Friend Module MySettingsProperty

        '''<summary>
        '''  Gets the instance exposed by MySettings.Default.
        '''</summary>
        <Global.System.ComponentModel.Design.HelpKeywordAttribute("My.Settings")> _
        Friend ReadOnly Property Settings() As Global.InvoiceParsing.My.MySettings
            Get
                Dim sharedSettings As Global.InvoiceParsing.My.MySettings = Global.InvoiceParsing.My.MySettings.Default
                Return sharedSettings
            End Get
        End Property
    End Module
End Namespace

Download Source Code (.zip)

Return to the previous page Explore PDF Extractor SDK