PDF To JSON - PowerShell
PDF Extractor SDK sample in PowerShell demonstrating "PDF To JSON"
pdf-to-json.bat
@echo off
rem pdf-to-json.bat - launcher for pdf-to-json.ps1.
rem
rem Usage:   pdf-to-json.bat folder_path
rem Example: pdf-to-json.bat "c:\documents"
rem
rem Set NOPAUSE=1 to suppress the "press any key" prompts (unattended runs).
rem Exit code: 1 when no folder is given, otherwise the exit code reported
rem by the PowerShell process.
setlocal

if "%~1"=="" (
    echo -----------------------------------------------------
    echo Invalid parameter!
    echo -----------------------------------------------------
    echo Usage: pdf-to-json.bat folder_path
    echo Example: pdf-to-json.bat "c:\documents"
    echo -----------------------------------------------------
    if not "%NOPAUSE%"=="1" pause
    exit /b 1
)

rem -File hands the folder to the script as one literal argument; with
rem -Command the text is re-parsed by PowerShell and a path containing
rem spaces is split into several arguments.
rem %~dp0 locates the .ps1 next to this batch file, independent of the
rem caller's current directory. %~1 strips any quotes the caller supplied
rem so the argument is quoted exactly once.
powershell -NoProfile -ExecutionPolicy Bypass -File "%~dp0pdf-to-json.ps1" "%~1"

rem Capture the exit code immediately so later commands cannot overwrite it.
set "PS_EXIT=%errorlevel%"

echo Script finished with errorlevel=%PS_EXIT%
if not "%NOPAUSE%"=="1" pause
exit /b %PS_EXIT%
pdf-to-json.ps1
# Converts the PDF files found (recursively) under a folder to JSON using
# the Bytescout PDF Extractor SDK. For each PDF a file with the same path
# and a "json" extension is written next to it.
#
# Parameters:
#   InputFolder - folder to search for *.pdf files (mandatory).
#
# Exit code: 0 on success; 1 when the folder does not exist or when an
# error interrupts the conversion.
Param(
    [Parameter(Mandatory = $true)]
    [string] $InputFolder
)

# Check input folder exists before loading the SDK. -PathType Container
# rejects a path that points at a file instead of a folder.
if (-not (Test-Path -Path $InputFolder -PathType Container)) {
    Write-Host "Target folder does not exist." -ForegroundColor Red
    # Non-zero so the calling batch file does not report success.
    exit 1
}

# Add reference to Bytescout.PDFExtractor.dll assembly
Add-Type -Path "c:\Program Files\Bytescout PDF Extractor SDK\net4.00\Bytescout.PDFExtractor.dll"

# Create and activate JSONExtractor instance
$jsonExtractor = New-Object Bytescout.PDFExtractor.JSONExtractor
$jsonExtractor.RegistrationName = "demo"
$jsonExtractor.RegistrationKey = "demo"

$exitCode = 0

try {
    # Get PDF files from input folder. -Include matches on name only, so
    # directories whose name ends in ".pdf" are filtered out explicitly.
    $files = Get-ChildItem -Path $InputFolder -Recurse -Include "*.pdf" |
        Where-Object { -not $_.PSIsContainer }

    foreach ($file in $files) {
        Write-Host "Input file" $file.FullName

        # Construct output file name
        $jsonFileName = [System.IO.Path]::ChangeExtension($file.FullName, "json")
        Write-Host " Output file" $jsonFileName

        # Load PDF document
        $jsonExtractor.LoadDocumentFromFile($file.FullName)

        # Disable the formatting reconstruction
        $jsonExtractor.PreserveFormattingOnTextExtraction = $false

        # Extract first page to JSON
        $jsonExtractor.SaveJSONToFile(0, $jsonFileName)

        # Reset extractor
        $jsonExtractor.Reset()
    }
}
catch {
    Write-Host $_.Exception.Message
    # Report the failure to the caller instead of finishing with code 0.
    $exitCode = 1
}
finally {
    # Release the extractor on every exit path; the null check covers the
    # case where New-Object failed and left the variable empty.
    if ($jsonExtractor) {
        $jsonExtractor.Dispose()
    }
}

exit $exitCode