PDF Toolkit

Samples

Extract text from PDF

This code sample requires the PDF Toolkit and can be found in: C:\Aquaforest\PDF Toolkit\samples\ExtractTextFromPDF

using Aquaforest.PDF;
using System;
using System.Collections.Generic;
using System.Drawing;
    
namespace ExtractTextFromPDF
{
    /// <summary>
    /// Console sample that demonstrates the text-extraction calls made on
    /// <c>PDFDocument</c>: per page, whole document, by area, word data,
    /// and hOCR export.
    /// </summary>
    /// <remarks>
    /// Every helper is best-effort: a failure is reported on the standard error
    /// stream and the helper returns its fallback value (an empty string or
    /// <c>null</c>) so that the remaining steps of the sample still run.
    /// </remarks>
    internal class ExtractTextFromPDF
    {
        /// <summary>
        /// Entry point. Runs each extraction helper against the sample document
        /// and prints the extracted text to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        private static void Main(string[] args)
        {
            //Assign Licence Key
            PDFToolkit.LicenseKey = string.Empty;
            string inputFile = @"..\documents\source\cookbook.pdf";
            string outputFile = @"..\documents\output\hocr_ouput";

            //Get and print text from page 1
            string pageOne = GetTextFromPage(inputFile, 1);
            Console.WriteLine(pageOne);
            Console.WriteLine();

            //Get and print text from whole document
            string wholeDocument = GetTextFromPDFFile(inputFile);
            Console.WriteLine(wholeDocument);
            Console.WriteLine();

            //Get and print text from page 2 by rect
            Rectangle rect = new Rectangle { Height = 200, Width = 200, X = 10, Y = 10 };
            string rectText = GetTextFromAreaByRect(inputFile, rect, 2);
            Console.WriteLine(rectText);
            Console.WriteLine();

            //Get and print text from page 2 by coords (same area as the rect above)
            string coordText = GetTextFromAreaByCordinates(inputFile, 10, 10, 200, 200, 2);
            Console.WriteLine(coordText);
            Console.WriteLine();

            //Get Document Word Data (retrieved for demonstration only; null on failure)
            var wordData = GetTextWithCordinatesAndFonts(inputFile);

            //Save Text in file as hocr file
            SaveTextAsHOCRFiles(inputFile, outputFile);
        }

        /// <summary>
        /// Opens <paramref name="doc"/> and calls <c>GetText(pageNumber)</c> on it.
        /// </summary>
        /// <param name="doc">Path of the PDF file to open.</param>
        /// <param name="pageNumber">Page to read; the sample passes 1 for the first page.</param>
        /// <returns>The extracted text, or an empty string if the call failed.</returns>
        private static string GetTextFromPage(string doc, int pageNumber)
        {
            try
            {
                PDFDocument pdfDoc = new PDFDocument(doc);
                return pdfDoc.GetText(pageNumber);
            }
            catch (Exception ex)
            {
                ReportFailure("get page text from", doc, ex);
                return string.Empty;
            }
        }

        /// <summary>
        /// Opens <paramref name="doc"/> and calls the parameterless <c>GetText()</c> on it.
        /// </summary>
        /// <param name="doc">Path of the PDF file to open.</param>
        /// <returns>The extracted text, or an empty string if the call failed.</returns>
        private static string GetTextFromPDFFile(string doc)
        {
            try
            {
                PDFDocument pdfDoc = new PDFDocument(doc);
                return pdfDoc.GetText();
            }
            catch (Exception ex)
            {
                ReportFailure("get document text from", doc, ex);
                return string.Empty;
            }
        }

        /// <summary>
        /// Opens <paramref name="doc"/> and calls
        /// <c>GetTextByArea(x, y, width, height, pageNumber)</c> on it.
        /// </summary>
        /// <param name="doc">Path of the PDF file to open.</param>
        /// <param name="x">X value forwarded to <c>GetTextByArea</c>.</param>
        /// <param name="y">Y value forwarded to <c>GetTextByArea</c>.</param>
        /// <param name="width">Width value forwarded to <c>GetTextByArea</c>.</param>
        /// <param name="height">Height value forwarded to <c>GetTextByArea</c>.</param>
        /// <param name="pageNumber">Page to read.</param>
        /// <returns>The extracted text, or an empty string if the call failed.</returns>
        private static string GetTextFromAreaByCordinates(string doc, double x, double y, double width, double height, int pageNumber)
        {
            // NOTE(review): units and origin of the coordinates are defined by the toolkit - confirm.
            try
            {
                PDFDocument pdfDoc = new PDFDocument(doc);
                return pdfDoc.GetTextByArea(x, y, width, height, pageNumber);
            }
            catch (Exception ex)
            {
                ReportFailure("get area text (by coordinates) from", doc, ex);
                return string.Empty;
            }
        }

        /// <summary>
        /// Opens <paramref name="doc"/> and calls <c>GetTextByArea(rect, pageNumber)</c> on it.
        /// </summary>
        /// <param name="doc">Path of the PDF file to open.</param>
        /// <param name="rect">Rectangle forwarded to <c>GetTextByArea</c>.</param>
        /// <param name="pageNumber">Page to read.</param>
        /// <returns>The extracted text, or an empty string if the call failed.</returns>
        private static string GetTextFromAreaByRect(string doc, Rectangle rect, int pageNumber)
        {
            try
            {
                PDFDocument pdfDoc = new PDFDocument(doc);
                return pdfDoc.GetTextByArea(rect, pageNumber);
            }
            catch (Exception ex)
            {
                ReportFailure("get area text (by rectangle) from", doc, ex);
                return string.Empty;
            }
        }

        /// <summary>
        /// Opens <paramref name="doc"/> and calls <c>GenerateHocrFromText(output, false)</c> on it.
        /// </summary>
        /// <param name="doc">Path of the PDF file to open.</param>
        /// <param name="output">Output location forwarded to <c>GenerateHocrFromText</c>.</param>
        private static void SaveTextAsHOCRFiles(string doc, string output)
        {
            try
            {
                PDFDocument pdfDoc = new PDFDocument(doc);
                // NOTE(review): meaning of the second argument (false) is not visible here - confirm against the toolkit reference.
                pdfDoc.GenerateHocrFromText(output, false);
            }
            catch (Exception ex)
            {
                ReportFailure("generate hOCR output from", doc, ex);
            }
        }

        /// <summary>
        /// Opens <paramref name="doc"/> and returns the result of <c>GetDocumentWordData()</c>.
        /// </summary>
        /// <param name="doc">Path of the PDF file to open.</param>
        /// <returns>The word data list, or <c>null</c> if the call failed.</returns>
        // NOTE(review): the return type reads as a bare "List"; its generic type argument
        // appears to have been lost - restore it to match what GetDocumentWordData() returns.
        private static List GetTextWithCordinatesAndFonts(string doc)
        {
            try
            {
                PDFDocument pdfDoc = new PDFDocument(doc);
                return pdfDoc.GetDocumentWordData();
            }
            catch (Exception ex)
            {
                ReportFailure("get word data from", doc, ex);
                return null;
            }
        }

        /// <summary>
        /// Writes a one-line diagnostic to the standard error stream so that a failed
        /// step is visible instead of being silently swallowed.
        /// </summary>
        /// <param name="operation">Short description of the step that failed.</param>
        /// <param name="doc">Path of the PDF file the step was run against.</param>
        /// <param name="ex">The exception that was caught.</param>
        private static void ReportFailure(string operation, string doc, Exception ex)
        {
            Console.Error.WriteLine("Failed to {0} '{1}': {2}", operation, doc, ex.Message);
        }
    }
}