#ifndef pdf_text_extractH
#define pdf_text_extractH

#include <stdio.h>
#include <stdlib.h>

#define PDF_STATIC // Enable static binding when compiling the project with the workspace dynapdf_static.
#include "../../../include/C_CPP/dynapdf.h"

using namespace DynaPDF;

/*
   This class uses the function GetPageText() to extract the text of a PDF file.
   It demonstrates how text lines and word boundaries can be identified. The
   algorithm handles rotated text as well as text lines which consist of multiple
   text records correctly.

   The identification of text lines and word boundaries is very important if
   you want to develop a text search or text replacement algorithm.

   The function GetPageText() should normally be used to replace or delete certain
   texts of a page. Take a look at the example edit_text to see how texts can be
   replaced. Since the demo class CPDFEditText is already rather complex, it is
   easier to understand how text lines can be constructed with a smaller example.

   GetPageText() internally uses the content parser of DynaPDF. If you just want to
   extract the contents of a PDF file, it is better to use the content parser directly
   because it is faster than GetPageText(). Take a look at the examples
   text_extraction2 or text_search to see how it can be used. However, it is
   currently not possible to combine the content parser with editing functions.
*/

// We need the list to store template handles.
template<class T>class CTList
{
  public:
   CTList(void) :
      m_Capacity(0),
      m_Count(0),
      m_Items(NULL)
   {}
   ~CTList(void)
   {
      if (m_Items) free(m_Items);
   }
   void Add(T &Value)
   {
      if (m_Count == m_Capacity)
      {
         T* tmp = (T*)realloc(m_Items, (m_Capacity + 64) * sizeof(T));
         if (!tmp) throw "Out of memory!";
         m_Items     = tmp;
         m_Capacity += 64;
      }
      m_Items[m_Count++] = Value;
   }
   void Clear(void)               {m_Count = 0;}
   UI32 Count(void)         const {return m_Count;}
   SI32 Find(T &Value)
   {
      SI32 i = 0;
      SI32 e = m_Count - 1;
      while (i <= e)
      {
         if (m_Items[i] == Value) return i;
         if (m_Items[e] == Value) return e;
         ++i;
         --e;
      }
      return -1;
   }
   T*   GetItem(UI32 Index) const {return (m_Items + Index);}
  private:
   UI32 m_Capacity;
   UI32 m_Count;
   T*   m_Items;
};

/*
   We assume that a text record lies on the same text line if the error on the y-axis is less
   than two units. A small error is always possible, especially if the PDF file was created by
   a GDI printer driver, because the reference point of text in the GDI is the upper left corner
   of the font's bounding box while it is the baseline in PDF. This often causes small
   differences when a font is changed, e.g. from a regular style to italic or bold.

   NOTE(review): the value is stored squared, presumably so that it can be compared with a
   squared distance without computing a square root -- confirm against the implementation.
*/
#define MAX_LINE_ERROR 4.0 // This must be the square of the allowed error (2 * 2 in this case).

/*
   Text extraction demo class. Only Close() is defined in this header; every other
   member function is implemented in the accompanying source file, so the notes on
   those functions below are assumptions derived from their signatures.

   The class holds a raw FILE* that Close() releases with fclose(). A copy of an
   instance would hold the same pointer, and the stream could then be closed twice.
   Copy construction and copy assignment are therefore declared private and left
   undefined.
*/
class CTextExtraction
{
  public:
   // PDFInst is stored as an opaque pointer.
   // NOTE(review): presumably the DynaPDF instance handle kept in m_PDF -- confirm.
   CTextExtraction(const void* PDFInst);
  ~CTextExtraction(void);
   // Closes the file if one is open and resets the handle to NULL, so that calling
   // Close() repeatedly is harmless.
   void Close(void){if (m_File){fclose(m_File); m_File = NULL;}}
   // NOTE(review): presumably opens FileName and stores the stream in m_File -- confirm.
   void Open(const char* FileName);
   // NOTE(review): presumably extracts the text of the currently open page -- confirm.
   void ParsePage(void);
   // printf-style variadic function taking a wide character format string.
   // NOTE(review): presumably writes a page header into the file -- confirm.
   void WritePageIdentifier(const wchar_t* Fmt, ...);
  protected:
   // Text direction. tfNotInitialized (-1) is the only negative value.
   typedef enum
   {
      tfLeftToRight    = 0,
      tfRightToLeft    = 1,
      tfTopToBottom    = 2,
      tfBottomToTop    = 4,
      tfNotInitialized = -1
   }TTextDir;

   FILE*        m_File;          // Stream released by Close(); NULL when no file is open
   TTextDir     m_LastTextDir;   // NOTE(review): presumably the direction of the previous text record -- confirm
   double       m_LastTextEndX;  // NOTE(review): m_LastText* presumably describe the previous text record -- confirm
   double       m_LastTextEndY;
   double       m_LastTextInfX;
   double       m_LastTextInfY;
   UI32         m_PageCount;
   const void*  m_PDF;
   TPDFStack    m_Stack;
   CTList<SI32> m_Templates;     // List of template handles

   SI32   AddText(void);
   double CalcDistance(double x1, double y1, double x2, double y2);
   bool   IsPointOnLine(double x, double y, double x0, double y0, double x1, double y1);
   TCTM   MulMatrix(TCTM &M1, TCTM &M2);
   void   ParseTemplates(void);
   void   ParseText(void);
   void   Transform(TCTM &M, double &x, double &y);
  private:
   // Not copyable: a copy would share m_File and could fclose() the same stream twice.
   // Declared but intentionally not defined.
   CTextExtraction(const CTextExtraction&);
   CTextExtraction& operator=(const CTextExtraction&);
};

#endif
