Link Search Menu Expand Document

Parse Simple Document - C#

Document Parser SDK sample in C# demonstrating ‘Parse Simple Document’

AmazonAWS.yml
templateName: Amazon Web Services Invoice
templateVersion: 4
templatePriority: 0
detectionRules:
  keywords:
  - Amazon Web Services
  - ATTN
  - Invoice
objects:
- name: total
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: TOTAL AMOUNT DUE ON{{Anything}}{{Dollar}}({{Number}})
    regex: true
    dataType: decimal
- name: subTotal
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: '{{LineStart}}{{Spaces}}Charges{{Spaces}}{{Dollar}}({{Number}})'
    regex: true
    dataType: decimal
- name: dateIssued
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: Invoice Date:{{Spaces}}({{Anything}}){{LineEnd}}
    regex: true
    dataType: date
    dateFormat: MMMM d , yyyy
- name: invoiceId
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: Invoice Number:{{Spaces}}({{Digits}})
    regex: true
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: Amazon Web Services, Inc.
    regex: true
- name: companyWebsite
  objectType: field
  fieldProperties:
    fieldType: static
    expression: aws.amazon.com
    regex: true
- name: billTo
  objectType: field
  fieldProperties:
    fieldType: rectangle
    expression: Bill to Address:{{ToggleSingleLineMode}}({{AnythingGreedy}})
    regex: true
    rectangle:
    - 33
    - 115.5
    - 213.75
    - 72.75
    pageIndex: 0
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: '{{LineStart}}{{Spaces}}Detail{{LineEnd}}'
      regex: true
    end:
      expression: '{{EndOfPage}}'
      regex: true
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}}){{LineEnd}}'
      regex: true
    columns:
    - name: unitPrice
      dataType: decimal


DigitalOcean.yml
templateName: DigitalOcean Invoice
templateVersion: 4
templatePriority: 0
detectionRules:
  keywords:
  - DigitalOcean
  - 101 Avenue of the Americas
  - Invoice Number
objects:
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: DigitalOcean
    regex: true
- name: invoiceId
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Invoice Number: ({{Digits}})'
    regex: true
- name: dateIssued
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Date Issued: ({{SmartDate}})'
    regex: true
    dataType: date
    dateFormat: auto-mdy
- name: total
  objectType: field
  fieldProperties:
    fieldType: macros
    expression: 'Total: {{Dollar}}({{Number}})'
    regex: true
    dataType: decimal
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: Description{{Spaces}}Hours
      regex: true
    end:
      expression: 'Total:'
      regex: true
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<hours>{{Digits}}){{Spaces}}(?<start>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}(?<end>{{2Digits}}{{Minus}}{{2Digits}}{{Space}}{{2Digits}}{{Colon}}{{2Digits}}){{Spaces}}{{Dollar}}(?<unitPrice>{{Number}})'
      regex: true
    columns:
    - name: hours
      dataType: integer
    - name: unitPrice
      dataType: decimal


Google.yml
templateName: Google Invoice
templateVersion: 4
templatePriority: 0
detectionRules:
  keywords:
  - Google
  - 77-0493581
  - Invoice
objects:
- name: invoiceId
  objectType: field
  fieldProperties:
    expression: Invoice number:{{Spaces}}({{Digits}})
    regex: true
- name: dateIssued
  objectType: field
  fieldProperties:
    expression: Issue date:{{Spaces}}({{SmartDate}})
    regex: true
    dataType: date
    dateFormat: MMM d, yyyy
- name: total
  objectType: field
  fieldProperties:
    expression: Amount due in USD:{{Spaces}}{{Number}}
    regex: true
    dataType: decimal
- name: subTotal
  objectType: field
  fieldProperties:
    expression: Subtotal in USD:{{Spaces}}{{Number}}
    regex: true
    dataType: decimal
- name: taxRate
  objectType: field
  fieldProperties:
    expression: State sales tax {{OpeningParenthesis}}{{Digits}}{{Percent}}{{ClosingParenthesis}}
    regex: true
    dataType: integer
- name: tax
  objectType: field
  fieldProperties:
    expression: State sales tax{{Anything}}{{Number}}{{LineEnd}}
    regex: true
    dataType: decimal
- name: companyName
  objectType: field
  fieldProperties:
    fieldType: static
    expression: Google LLC
    regex: true
- name: billTo
  objectType: field
  fieldProperties:
    fieldType: rectangle
    regex: true
    rectangle:
    - 0
    - 152
    - 280
    - 72
    pageIndex: 0
- name: billingId
  objectType: field
  fieldProperties:
    expression: Billing ID:{{Spaces}}({{DigitsOrSymbols}})
    regex: true
- name: currency
  objectType: field
  fieldProperties:
    fieldType: static
    expression: USD
    regex: true
- name: table1
  objectType: table
  tableProperties:
    start:
      expression: Description{{Spaces}}Interval{{Spaces}}Quantity{{Spaces}}Amount
      regex: true
    end:
      expression: Subtotal in USD
      regex: true
    row:
      expression: '{{LineStart}}{{Spaces}}(?<description>{{SentenceWithSingleSpaces}}){{Spaces}}(?<interval>{{3Letters}}{{Space}}{{Digits}}{{Space}}{{Minus}}{{Space}}{{3Letters}}{{Space}}{{Digits}}){{Spaces}}(?<quantity>{{Digits}}){{Spaces}}(?<amount>{{Number}})'
      regex: true
    columns:
    - name: quantity
      dataType: integer
    - name: amount
      dataType: decimal


Program.cs
using System;
using System.Text;
using ByteScout.DocumentParser;

// This example demonstrates document data parsing to JSON, YAML and XML formats.

namespace GeneralExample
{
    class Program
    {
        static void Main(string[] args)
        {
            var templates = new string[] { "DigitalOcean.yml", "AmazonAWS.yml", "Google.yml" } ;

            string inputDocument1 = @".\DigitalOcean.pdf";
            string inputDocument2 = @".\AmazonAWS.pdf";
            string inputDocument3 = @".\Google.pdf";

            // Create DocumentParser instance
            using (DocumentParser documentParser = new DocumentParser("demo", "demo"))
            {
                Console.WriteLine($"Loading templates...");

                foreach (var template in templates)
                {
                    documentParser.AddTemplate(template);
                }

                Console.WriteLine($"Templates loaded.");
                Console.WriteLine();

                Console.WriteLine($"Parsing \"{inputDocument1}\"...");
                Console.WriteLine();

                // Parse document data in JSON format
                string jsonString = documentParser.ParseDocument(inputDocument1, OutputFormat.JSON);
                // Display parsed data in console
                Console.WriteLine("Parsing results in JSON format:");
                Console.WriteLine();
                Console.WriteLine(jsonString);

                Console.WriteLine();
                Console.WriteLine($"Parsing \"{inputDocument2}\"...");
                Console.WriteLine();

                // Parse document data in YAML format
                string yamlString = documentParser.ParseDocument(inputDocument2, OutputFormat.YAML);
                // Display parsed data in console
                Console.WriteLine("Parsing results in YAML format:");
                Console.WriteLine();
                Console.WriteLine(yamlString);
                
                Console.WriteLine();
                Console.WriteLine($"Parsing \"{inputDocument3}\"...");
                Console.WriteLine();

                // Parse document data in CSV format
                string xmlString = documentParser.ParseDocument(inputDocument3, OutputFormat.CSV, new CSVOptions
                {
                    GenerateColumnHeaders = true,
                    SeparatorCharacter = ",",
                    QuotationCharacter = "\"",
                    Encoding = Encoding.UTF8
                });

                // Display parsed data in console
                Console.WriteLine("Parsing results in XML format:");
                Console.WriteLine();
                Console.WriteLine(xmlString);
            }

            Console.WriteLine();
            Console.WriteLine("Press any key to continue...");
            Console.ReadLine();
        }
    }
}

Download Source Code (.zip)

Return to the previous page Explore Document Parser SDK


Copyright © 2016 - 2021 ByteScout