Find Website Addresses in PDF with Regex - VB.NET
PDF Extractor SDK sample in VB.NET demonstrating ‘Find Website Addresses in PDF with Regex’
Program.vb
Imports Bytescout.PDFExtractor
' Note: if you are looking for a higher-level API to extract data from invoices, reports, and statements,
' please check the Document Parser SDK and Web API at https://bytescout.com/products/developer/documentparsersdk/index.html
' and https://pdf.co for a secure and scalable Web API.
''' <summary>
''' Console sample that loads "SampleInvoice.pdf" and prints every piece of text
''' matching a website-address regular expression, page by page.
''' </summary>
Module Program

    ''' <summary>
    ''' Entry point. Searches each page of the sample PDF for http/https URLs,
    ''' writes the matches to the console, and waits for Enter before exiting.
    ''' Any exception raised while loading or searching is reported on the
    ''' console instead of terminating the process.
    ''' </summary>
    Sub Main()

        ' Website address pattern:
        '   http(s)?://          - scheme, "http://" or "https://"
        '   ([\w-]+\.)+[\w-]+    - host: labels separated by literal dots
        '                          (the dot is escaped so it does not match any character)
        '   (/[\w- ./?%&=]*)?    - optional path/query; "*" lets it span the whole path
        '                          rather than a single character after the first "/"
        ' See the complete regular expressions reference at https://msdn.microsoft.com/en-us/library/az24scfc(v=vs.110).aspx
        ' Declared once here because it does not change between pages.
        Dim regexPattern As String = "http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?"

        Try

            ' Create Bytescout.PDFExtractor.TextExtractor instance
            Using extractor As TextExtractor = New TextExtractor()

                extractor.RegistrationName = "demo"
                extractor.RegistrationKey = "demo"

                ' Load sample PDF document
                extractor.LoadDocumentFromFile("SampleInvoice.pdf")

                ' Treat the search string passed to Find() as a regular expression
                extractor.RegexSearch = True

                Dim pageCount As Integer = extractor.GetPageCount()

                ' Search through pages (page indexes are zero-based here)
                For i As Integer = 0 To pageCount - 1

                    ' Search the current page for the pattern; third argument is passed as False as in the original sample
                    If extractor.Find(i, regexPattern, False) Then
                        Do
                            ' Iterate through each element in the found text
                            For Each element As ISearchResultElement In extractor.FoundText.Elements
                                Console.WriteLine("Found URL: " & element.Text)
                            Next
                        Loop While extractor.FindNext()
                    End If

                Next

            End Using

        Catch ex As Exception
            ' Report the failure but still fall through to the "press enter" prompt below
            Console.WriteLine("Error: " & ex.Message)
        End Try

        Console.WriteLine()
        Console.WriteLine("Press enter key to continue...")
        Console.ReadLine()

    End Sub

End Module