PDF Toolkit

Samples

Use the PDF Toolkit with Aquaforest's OCR SDK

The code below loops through all the PDF documents in a directory and checks whether each one contains text or is image-only. The image-only PDFs are OCRed and stamped. If an OCRed document has more than one page, it is split into single pages. Note: For this example to work, you must first download the OCR SDK and then add a reference to Aquaforest.OCR.Api.

This code sample requires both the PDF Toolkit and the OCR SDK. The sample can be found in: C:\Aquaforest\PDF Toolkit\samples\PDFToolkitWithOCRSDK

using Aquaforest.OCR.Api;
using Aquaforest.PDF;
using System;
using System.IO;
using System.Drawing;
    
namespace PDFToolkitWithOCRSDK
{
    /// <summary>
    /// Sample program: walks a directory tree of PDF files, OCRs the ones that
    /// contain no extractable text, stamps the OCRed output and splits it when
    /// the source document has more than one page.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point. Processes every "*.pdf" file found under the sample
        /// source directory (searched recursively).
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            DirectoryInfo directory = new DirectoryInfo(@"..\..\..\documents\source\tree");
            if (!directory.Exists)
            {
                // GetFiles would throw DirectoryNotFoundException; report the problem instead.
                Console.WriteLine("Source directory not found: {0}", directory.FullName);
                return;
            }

            foreach (var file in directory.GetFiles("*.pdf", SearchOption.AllDirectories))
            {
                Console.WriteLine("Processing PDF file: {0}", file.FullName);

                string docText;
                int numberOfPages;
                PDFDocument doc = new PDFDocument(file.FullName);
                try
                {
                    // Strip all line breaks so a document holding only newlines counts as empty.
                    docText = doc.GetText().Replace("\r", "").Replace("\n", "");
                    numberOfPages = doc.NumberOfPages;
                }
                finally
                {
                    // Close the document even if text extraction fails.
                    doc.Close();
                }

                // OCR document only if it is image only
                if (docText.Length == 0)
                {
                    Console.WriteLine("PDF file is image-only");

                    // Only the immediate parent folder name is used, so the output tree is one level deep.
                    string outputDirectory = Path.Combine(@"..\..\..\documents\output", file.Directory.Name);

                    // CreateDirectory does nothing when the directory already exists.
                    Directory.CreateDirectory(outputDirectory);

                    string output = Path.Combine(outputDirectory, file.Name);
                    if (OCRPDF(file.FullName, output))
                    {
                        AddStamp(output);
                        if (numberOfPages > 1)
                        {
                            SplitFile(output);
                        }
                    }
                }
                else
                {
                    Console.WriteLine("PDF file contains text");
                }
                Console.WriteLine("");
            }
        }

        /// <summary>
        /// Runs OCR on <paramref name="source"/> and, when recognition succeeds,
        /// saves the resulting PDF to <paramref name="output"/>.
        /// </summary>
        /// <param name="source">Path of the PDF to OCR.</param>
        /// <param name="output">Path the OCRed PDF is saved to.</param>
        /// <returns>
        /// true only when recognition succeeded and the output was saved;
        /// false when recognition failed or an exception was thrown.
        /// </returns>
        static bool OCRPDF(string source, string output)
        {
            try
            {
                Ocr ocr = new Ocr();
                PreProcessor preProcessor = new PreProcessor();
                string ocrFiles = Path.GetFullPath(@"..\..\lib\");

                // Add the OCR lib folder to PATH once; this method runs once per
                // file, and appending unconditionally would grow PATH on every call.
                string currentPath = Environment.GetEnvironmentVariable("PATH") ?? "";
                if (currentPath.IndexOf(ocrFiles, StringComparison.OrdinalIgnoreCase) < 0)
                {
                    Environment.SetEnvironmentVariable("PATH", currentPath + ";" + ocrFiles);
                }

                ocr.ResourceFolder = ocrFiles;
                ocr.EnableConsoleOutput = true;
                ocr.Language = SupportedLanguages.English;
                ocr.EnablePdfOutput = true;
                ocr.ReadPDFSource(source);
                preProcessor.Deskew = true;
                preProcessor.Autorotate = false;

                bool recognized = ocr.Recognize(preProcessor);
                if (recognized)
                {
                    ocr.SavePDFOutput(output, true);
                }
                else
                {
                    Console.WriteLine("OCR recognition failed, no output written for: " + source);
                }
                ocr.DeleteTemporaryFiles();

                // Report success only when an output file was saved, so the caller
                // does not try to stamp or split a file this method never wrote.
                return recognized;
            }
            catch (Exception e)
            {
                Console.WriteLine("Error in OCR Processing :" + e.Message);
                return false;
            }
        }

        /// <summary>
        /// Stamps the text "THIS IS A TEST STAMP" onto the PDF at
        /// <paramref name="output"/>, using the document's own path as the
        /// stamper's target.
        /// </summary>
        /// <param name="output">Path of the PDF to stamp.</param>
        static void AddStamp(string output)
        {
            Console.WriteLine("Adding stamp...");
            // NOTE(review): Main closes its PDFDocument explicitly; whether the
            // stamper releases this one is not visible here - confirm against the API.
            PDFDocument doc = new PDFDocument(output);
            PDFStamper stamper = new PDFStamper(doc, doc.FilePath);
            stamper.FontSize = 12;
            stamper.StampOpacity = 100;
            stamper.StampColor = Color.Black;
            stamper.StampPDFText("THIS IS A TEST STAMP", 200, 200);
        }

        /// <summary>
        /// Splits the PDF at <paramref name="output"/>; the split files use the
        /// source file's name (without extension) and are written to its directory.
        /// </summary>
        /// <param name="output">Path of the PDF to split.</param>
        static void SplitFile(string output)
        {
            Console.WriteLine("Splitting file...");
            // NOTE(review): as in AddStamp, this document is never explicitly closed - confirm.
            PDFDocument doc = new PDFDocument(output);
            PDFSplitter splitter = new PDFSplitter(doc);
            splitter.OutputFileName = Path.GetFileNameWithoutExtension(output);
            splitter.OutputFilePath = Path.GetDirectoryName(output);
            // NOTE(review): the meaning of the (1, 5, 1) arguments is not visible
            // here - verify against the PDFSplitter documentation.
            splitter.SplitByRepeatingNumber(1, 5, 1);
        }
    }
}