Link Search Menu Expand Document

PDF Invoice Parsing - VB.NET

PDF Extractor SDK sample in VB.NET demonstrating ‘PDF Invoice Parsing’

Module1.vb
Imports System.Drawing
Imports Bytescout.PDFExtractor

Module Module1

    ''' <summary>
    ''' Entry point. Opens "Invoice.pdf", then for every page looks for the
    ''' "Invoice No.", "Invoice Date", "Quantity" and "TOTAL" labels. The text to
    ''' the right of the first, second and last label is captured as the invoice
    ''' number, date and total; the region between "Quantity" and "TOTAL" is
    ''' exported as XML table data. Results are written to the console.
    ''' </summary>
    Sub Main()

        Dim extractor As TextExtractor = Nothing
        Dim tableExtractor As XMLExtractor = Nothing

        ' Results
        Dim invoiceNo = String.Empty
        Dim invoiceDate = String.Empty
        Dim total = String.Empty
        Dim tableData = String.Empty

        Try
            ' Create TextExtractor instance
            extractor = New TextExtractor("demo", "demo")
            ' Set exact search (default is SmartSearch that works like in Adobe Reader)
            extractor.WordMatchingMode = WordMatchingMode.ExactMatch

            ' Create XMLExtractor instance
            tableExtractor = New XMLExtractor("demo", "demo")

            ' Load document
            extractor.LoadDocumentFromFile("Invoice.pdf")
            tableExtractor.LoadDocumentFromFile("Invoice.pdf")

            ' Iterate pages
            For i As Integer = 0 To extractor.GetPageCount() - 1

                Dim pageRectangle = extractor.GetPageRectangle(i)
                ' Full page width, zero height until both table edges are located.
                Dim tableRect = New RectangleF(0, 0, pageRectangle.Width, 0)

                ' Search for "Invoice No." and assume the text at right is the invoice number
                If extractor.Find(i, "Invoice No.", False) Then
                    invoiceNo = ExtractTextRightOfMatch(extractor, i, pageRectangle.Right)
                End If

                ' Search for "Invoice Date" and extract text at right
                If extractor.Find(i, "Invoice Date", False) Then
                    invoiceDate = ExtractTextRightOfMatch(extractor, i, pageRectangle.Right)
                End If

                ' Search for "Quantity" keyword to detect the top of the tabular data rectangle
                If extractor.Find(i, "Quantity", False) Then
                    ' Keep the top table coordinate
                    ' (use FoundText.Bounds.Bottom instead if you want to skip column headers)
                    tableRect.Y = extractor.FoundText.Bounds.Top
                End If

                ' Search and extract "TOTAL" (it will be also the bottom of tabular data rectangle)
                If extractor.Find(i, "TOTAL", True) Then ' case sensitive!
                    ' Remember the label's top edge before extracting the value.
                    Dim totalTop = extractor.FoundText.Bounds.Top
                    total = ExtractTextRightOfMatch(extractor, i, pageRectangle.Right)

                    ' Calculate the table height
                    tableRect.Height = totalTop - tableRect.Top
                End If

                ' Extract tabular data using XMLExtractor
                If tableRect.Height > 0 Then
                    tableExtractor.SetExtractionArea(tableRect)
                    tableData = tableExtractor.GetXMLFromPage(i)
                End If
            Next

            ' Display extracted data
            Console.WriteLine("Invoice No.: " + invoiceNo)
            Console.WriteLine("Invoice Date: " + invoiceDate)
            Console.WriteLine("TOTAL: " + total)
            Console.WriteLine("Table Data: ")
            Console.WriteLine(tableData)

        Finally
            ' Cleanup: release the extractors even when loading or parsing throws.
            If extractor IsNot Nothing Then
                extractor.Dispose()
            End If
            If tableExtractor IsNot Nothing Then
                tableExtractor.Dispose()
            End If
        End Try

        Console.WriteLine()
        Console.WriteLine("Press any key...")
        Console.ReadKey()

    End Sub

    ''' <summary>
    ''' Returns the trimmed text located to the right of the extractor's most
    ''' recent search hit, from the hit's right edge up to <paramref name="pageRight"/>.
    ''' Call only after a successful Find on the same page.
    ''' </summary>
    ''' <param name="extractor">Extractor whose FoundText holds the label that was just found.</param>
    ''' <param name="pageIndex">Zero-based index of the page to read from.</param>
    ''' <param name="pageRight">Right edge coordinate of the page rectangle.</param>
    ''' <returns>The text read from the area right of the label, with surrounding whitespace removed.</returns>
    Private Function ExtractTextRightOfMatch(extractor As TextExtractor, pageIndex As Integer, pageRight As Single) As String
        ' Start from the found label's rectangle and shift it to the right.
        Dim area = extractor.FoundText.Bounds
        area.X = area.Right
        area.Width = pageRight - area.Left

        ' NOTE(review): the extraction area stays set on the extractor after this call.
        extractor.SetExtractionArea(area)
        Return extractor.GetTextFromPage(pageIndex).Trim()
    End Function

End Module

Resources.Designer.vb
'------------------------------------------------------------------------------
' <auto-generated>
'     This code was generated by a tool.
'     Runtime Version:4.0.30319.42000
'
'     Changes to this file may cause incorrect behavior and will be lost if
'     the code is regenerated.
' </auto-generated>
'------------------------------------------------------------------------------

Option Strict On
Option Explicit On


Namespace My.Resources

    'Tool-generated module (StronglyTypedResourceBuilder, e.g. via ResGen or
    'Visual Studio). Membership is driven by the .ResX file: edit that file and
    'rerun ResGen with /str, or rebuild the VS project, instead of editing here.
    '''<summary>
    '''  Strongly-typed entry point for looking up localized strings and other resources.
    '''</summary>
    <Global.Microsoft.VisualBasic.HideModuleNameAttribute(), _
     Global.System.Runtime.CompilerServices.CompilerGeneratedAttribute(), _
     Global.System.Diagnostics.DebuggerNonUserCodeAttribute(), _
     Global.System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0")> _
    Friend Module Resources

        'Manager instance created on first use; Nothing until then.
        Private cachedManager As Global.System.Resources.ResourceManager

        'Culture last assigned through the Culture property; Nothing until assigned.
        Private cultureOverride As Global.System.Globalization.CultureInfo

        '''<summary>
        '''  Gets the ResourceManager for "InvoiceParsing.Resources", building it on
        '''  the first request and handing back the same instance afterwards.
        '''</summary>
        <Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
        Friend ReadOnly Property ResourceManager() As Global.System.Resources.ResourceManager
            Get
                If cachedManager Is Nothing Then
                    cachedManager = New Global.System.Resources.ResourceManager("InvoiceParsing.Resources", GetType(Resources).Assembly)
                End If
                Return cachedManager
            End Get
        End Property

        '''<summary>
        '''  Gets or sets the culture held by this module for resource lookups made
        '''  through this strongly typed resource class.
        '''</summary>
        <Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
        Friend Property Culture() As Global.System.Globalization.CultureInfo
            Get
                Return cultureOverride
            End Get
            Set(ByVal value As Global.System.Globalization.CultureInfo)
                cultureOverride = value
            End Set
        End Property
    End Module
End Namespace

Settings.Designer.vb
'------------------------------------------------------------------------------
' <auto-generated>
'     This code was generated by a tool.
'     Runtime Version:4.0.30319.42000
'
'     Changes to this file may cause incorrect behavior and will be lost if
'     the code is regenerated.
' </auto-generated>
'------------------------------------------------------------------------------

Option Strict On
Option Explicit On


Namespace My

    '''<summary>
    '''  Application settings class derived from ApplicationSettingsBase.
    '''  Exposes one shared instance through the Default property. Declared
    '''  Partial, so further members may live in another file.
    '''</summary>
    <Global.System.Runtime.CompilerServices.CompilerGeneratedAttribute(), _
     Global.System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "11.0.0.0"), _
     Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
    Partial Friend NotInheritable Class MySettings
        Inherits Global.System.Configuration.ApplicationSettingsBase

        ' The single shared instance, created once and wrapped by ApplicationSettingsBase.Synchronized.
        Private Shared defaultInstance As MySettings = CType(Global.System.Configuration.ApplicationSettingsBase.Synchronized(New MySettings), MySettings)

#Region "My.Settings Auto-Save Functionality"
#If _MyType = "WindowsForms" Then
        ' Everything in this #If section is compiled only when _MyType is "WindowsForms".

        ' True once AutoSaveSettings has been attached to My.Application.Shutdown.
        Private Shared addedHandler As Boolean

        ' Lock object guarding the one-time handler registration in the Default getter.
        Private Shared addedHandlerLockObject As New Object

        '''<summary>
        '''  Shutdown event handler: saves My.Settings when
        '''  My.Application.SaveMySettingsOnExit is True.
        '''</summary>
        <Global.System.Diagnostics.DebuggerNonUserCodeAttribute(), Global.System.ComponentModel.EditorBrowsableAttribute(Global.System.ComponentModel.EditorBrowsableState.Advanced)> _
        Private Shared Sub AutoSaveSettings(ByVal sender As Global.System.Object, ByVal e As Global.System.EventArgs)
            If My.Application.SaveMySettingsOnExit Then
                My.Settings.Save()
            End If
        End Sub
#End If
#End Region

        '''<summary>
        '''  Gets the shared MySettings instance. In "WindowsForms" builds the first
        '''  access also attaches AutoSaveSettings to My.Application.Shutdown.
        '''</summary>
        Public Shared ReadOnly Property [Default]() As MySettings
            Get

#If _MyType = "WindowsForms" Then
                   ' Check the flag first without the lock, then again inside SyncLock,
                   ' so the handler is added only once.
                   If Not addedHandler Then
                        SyncLock addedHandlerLockObject
                            If Not addedHandler Then
                                AddHandler My.Application.Shutdown, AddressOf AutoSaveSettings
                                addedHandler = True
                            End If
                        End SyncLock
                    End If
#End If
                Return defaultInstance
            End Get
        End Property
    End Class
End Namespace

Namespace My

    '''<summary>
    '''  Module that surfaces the application settings as My.Settings.
    '''</summary>
    <Global.System.Runtime.CompilerServices.CompilerGeneratedAttribute(), _
     Global.System.Diagnostics.DebuggerNonUserCodeAttribute(), _
     Global.Microsoft.VisualBasic.HideModuleNameAttribute()> _
    Friend Module MySettingsProperty

        '''<summary>
        '''  Gets the settings object exposed by MySettings.Default.
        '''</summary>
        <Global.System.ComponentModel.Design.HelpKeywordAttribute("My.Settings")> _
        Friend ReadOnly Property Settings() As Global.InvoiceParsing.My.MySettings
            Get
                Dim current As Global.InvoiceParsing.My.MySettings = Global.InvoiceParsing.My.MySettings.Default
                Return current
            End Get
        End Property
    End Module
End Namespace

Download Source Code (.zip)

Return to the previous page Explore PDF Extractor SDK


Copyright © 2016 - 2021 ByteScout