Return to the previous page Explore PDF Extractor SDK

PDF and OCR (Optical Character Recognition) - C++

PDF Extractor SDK sample in C++ demonstrating ‘PDF and OCR (Optical Character Recognition)’

View on GitHub Download Source Code (.zip)

CPPExample.cpp

	#include "stdafx.h"
	#include "comip.h"

	#import "c:\\Program Files\\Bytescout PDF Extractor SDK\\net4.00\\Bytescout.PDFExtractor.tlb" raw_interfaces_only

	using namespace Bytescout_PDFExtractor;

	int _tmain(int argc, _TCHAR* argv[])
	{
		// Initialize COM.
		HRESULT hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);

		// Create the interface pointer.
		_TextExtractorPtr pITextExtractor(__uuidof(TextExtractor));

		// Set the registration name and key
		// Note: You should use _bstr_t or BSTR to pass string to the library because of COM requirements
		_bstr_t bstrRegName(L"DEMO"); 
		pITextExtractor->put_RegistrationName(bstrRegName);
		
		_bstr_t bstrRegKey(L"DEMO");
		pITextExtractor->put_RegistrationKey(bstrRegKey);

		// Load sample PDF document
		_bstr_t bstrPath(L"..\\..\\sample_ocr.pdf");
		pITextExtractor->LoadDocumentFromFile(bstrPath);

		// Enable Optical Character Recognition (OCR)
		// in .Auto mode (SDK automatically checks if needs to use OCR or not)
		pITextExtractor->put_OCRMode(OCRMode_Auto);
		
		// Set the location of OCR language data files
		_bstr_t bstrOCRLangDataPath(L"c:\\Program Files\\Bytescout PDF Extractor SDK\\ocrdata");
		pITextExtractor->put_OCRLanguageDataFolder(bstrOCRLangDataPath);

		// Set OCR language
		_bstr_t bstrOCRLanguage(L"eng");
		pITextExtractor->put_OCRLanguage(bstrOCRLanguage);

		// Set PDF document rendering resolution
		pITextExtractor->put_OCRResolution(300);


		// You can also apply various preprocessing filters
		// to improve the recognition on low-quality scans.

		_ImagePreprocessingFiltersCollection* pIImagePreprocessingFilters;
		pITextExtractor->get_OCRImagePreprocessingFilters(&pIImagePreprocessingFilters);

		// Automatically deskew skewed scans
		//pIImagePreprocessingFilters->AddDeskew();

		// Remove vertical or horizontal lines (sometimes helps to avoid OCR engine's page segmentation errors)
		//pIImagePreprocessingFilters->AddVerticalLinesRemover();
		//pIImagePreprocessingFilters->AddHorizontalLinesRemover();

		// Repair broken letters
		//pIImagePreprocessingFilters->AddDilate();

		// Remove noise
		//pIImagePreprocessingFilters->AddMedian();

		// Apply Gamma Correction
		//pIImagePreprocessingFilters->AddGammaCorrection();

		// Add Contrast
		//pIImagePreprocessingFilters->AddContrast(20);


		// (!) You can use new OCRAnalyser class to find an optimal set of image preprocessing 
		// filters for your specific document.
		// See "OCR Analyser" example.


		// Save extracted text to file
		_bstr_t bstrOutputFile(L"output.txt");
		pITextExtractor->SaveTextToFile(bstrOutputFile);

		pITextExtractor->Release();

		CoUninitialize();

		return 0;
	}

stdafx.cpp

// stdafx.cpp : source file that includes just the standard includes
// CPPExample.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information

#include "stdafx.h"

// TODO: reference any additional headers you need in STDAFX.H
// and not in this file

stdafx.h

// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//

#pragma once

#include "targetver.h"

#include <stdio.h>
#include <tchar.h>



// TODO: reference additional headers your program requires here

targetver.h

#pragma once

// Including SDKDDKVer.h defines the highest available Windows platform.

// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.

#include <SDKDDKVer.h>

PDF and OCR (Optical Character Recognition) - C++

CPPExample.cpp

stdafx.cpp

stdafx.h

targetver.h

Download Source Code (.zip)

Return to the previous page Explore PDF Extractor SDK