#include "pdf_text_extract.h"

#include <math.h>
#include <memory.h>
#include <stdarg.h>

#if defined(_MSC_VER)
   #pragma warning(disable : 4514)
#endif

// Creates a text extractor that works on the PDF instance PDFInst.
// No output file is open after construction (see Open()), the text position
// state is marked as "not initialized" and the page identifier counter is zero.
CTextExtraction::CTextExtraction(const PPDF* PDFInst) :
   m_File(NULL),                    // no output file until Open() succeeds
   m_LastTextDir(tfNotInitialized), // no text record was processed so far
   m_LastTextEndX(0.0),             // end point of the last text record
   m_LastTextEndY(0.0),
   m_LastTextInfX(0.0),             // far point that defines the current text line
   m_LastTextInfY(0.0),
   m_PageCount(0),                  // incremented by WritePageIdentifier()
   m_PDF(PDFInst)                   // handle passed to all pdf* calls
{}

// Closes the output file if one is still open.
CTextExtraction::~CTextExtraction(void)
{
   if (m_File != NULL)
   {
      fclose(m_File);
   }
}

// Writes the text record that is currently stored in m_Stack to the output file.
//
// The record's start point and direction are compared with the end of the
// previously written record:
//   - different direction or not on the same line -> a CR/LF pair is written
//     (except before the very first record) and a new reference line is set up;
//   - same line -> a blank is written if the gap to the previous record is
//     larger than one space width.
// Afterwards all kerning sub records are written as 16 bit code units and the
// end position of the record is stored for the next call.
//
// Returns: always 0.
SI32 CTextExtraction::AddText(void)
{
   const UI16 blank[]     = {32};
   const UI16 lineBreak[] = {13, 10};

   // Combined matrix that maps text space to user space
   TCTM toUser = MulMatrix(m_Stack.ctm, m_Stack.tm);

   // Start point of the text record in user space
   double startX = 0.0;
   double startY = 0.0;
   Transform(toUser, startX, startY);

   // A second point one font size along the text space y-axis. Together with the
   // start point it tells us in which direction the text runs. The distance
   // between both points would be the visible font size in user space:
   //   double realFontSize = CalcDistance(startX, startY, upX, upY);
   double upX = 0.0;
   double upY = m_Stack.FontSize;
   Transform(toUser, upX, upY);

   // Determine the text direction. The numeric values are cast to TTextDir
   // exactly as the enum is laid out: 0/1 when the y-coordinates differ,
   // 2/4 when they are equal.
   TTextDir dir;
   if (startY != upY)
      dir = (TTextDir)((startY > upY) ? 1 : 0);
   else
      dir = (TTextDir)((startX > upX) ? 4 : 2);

   // Same direction and still on the line of the previous record?
   const bool continuesLine = (dir == m_LastTextDir) &&
      IsPointOnLine(startX, startY, m_LastTextEndX, m_LastTextEndY, m_LastTextInfX, m_LastTextInfY);

   if (continuesLine)
   {
      // The space width is measured in text space but the distance between two
      // records is measured in user space, so the space width must be transformed
      // before both values can be compared. The full space width is used here
      // because the end position of the last record was stored as the record
      // width minus half a space width.
      double spaceEndX = m_Stack.SpaceWidth;
      double spaceEndY = 0.0;
      Transform(toUser, spaceEndX, spaceEndY);

      const double userSpaceWidth = CalcDistance(startX, startY, spaceEndX, spaceEndY);
      const double gap            = CalcDistance(m_LastTextEndX, m_LastTextEndY, startX, startY);
      if (gap > userSpaceWidth)
      {
         fwrite(blank, 2, 1, m_File);
      }
   }else
   {
      // A new line starts: extend the x-axis of this record to a far away point
      // that serves as second point of the new reference line.
      m_LastTextInfX = 1000000.0;
      m_LastTextInfY = 0.0;
      Transform(toUser, m_LastTextInfX, m_LastTextInfY);

      // No line break in front of the very first record
      if (m_LastTextDir != tfNotInitialized)
      {
         fwrite(lineBreak, 2, 2, m_File);
      }
   }

   // Threshold for kerning advances: minus half a space width, sign flipped when
   // the font size is negative.
   float halfSpace = -m_Stack.SpaceWidth * 0.5f;
   if (m_Stack.FontSize < 0.0f)
   {
      halfSpace = -halfSpace;
   }

   for (UI32 k = 0; k < m_Stack.KerningCount; ++k)
   {
      const TTextRecordW &part = m_Stack.Kerning[k];
      // An advance below the threshold is treated as a word gap
      if (part.Advance < halfSpace)
      {
         fwrite(blank, 2, 1, m_File);
      }
      fwrite(part.Text, 2, part.Length, m_File);
   }

   // The cursor is not set to the real end of the string because applications like
   // MS Word often add a space to the end of a text record and this space can
   // slightly overlap the next record. IsPointOnLine() would return false if the
   // new record overlaps the previous one. (halfSpace is normally negative.)
   m_LastTextEndX = m_Stack.TextWidth + halfSpace;
   m_LastTextEndY = 0.0;
   m_LastTextDir  = dir;
   // Convert the end coordinate of the text record to user space
   Transform(toUser, m_LastTextEndX, m_LastTextEndY);
   return 0;
}

// Returns the Euclidean distance between the points (x1, y1) and (x2, y2).
inline double CTextExtraction::CalcDistance(double x1, double y1, double x2, double y2)
{
   const double deltaX = x2 - x1;
   const double deltaY = y2 - y1;
   const double squared = deltaX * deltaX + deltaY * deltaY;
   return sqrt(squared);
}

// Checks whether the point (x, y) lies on the line segment (x0, y0) - (x1, y1).
//
// The point is projected onto the segment, the projection is clamped to the
// segment's end points and the squared distance between the point and the
// clamped projection is compared with MAX_LINE_ERROR.
//
// Returns: true if the squared distance is smaller than MAX_LINE_ERROR.
//
// NOTE(review): if both segment points are identical the projection divides
// zero by zero; presumably this yields NaN and the function returns false --
// confirm that this is the intended result for a degenerate segment.
bool CTextExtraction::IsPointOnLine(double x, double y, double x0, double y0, double x1, double y1)
{
   // Work relative to the segment's start point
   x -= x0;
   y -= y0;
   const double segX = x1 - x0;
   const double segY = y1 - y0;

   // Position of the projection along the segment (0 = start, 1 = end)
   double t = (x * segX + y * segY) / (segX * segX + segY * segY);
   if (t < 0.0)
      t = 0.0;
   else if (t > 1.0)
      t = 1.0;

   // Offset between the point and the clamped projection
   const double offX = x - t * segX;
   const double offY = y - t * segY;
   const double squaredDistance = offX * offX + offY * offY;
   return (squaredDistance < MAX_LINE_ERROR);
}

// Concatenates two transformation matrices and returns the result.
// With the convention used by Transform() the returned matrix applies M2 first
// and M1 afterwards.
inline TCTM CTextExtraction::MulMatrix(TCTM &M1, TCTM &M2)
{
   TCTM product;
   // Scaling / rotation part
   product.a = M1.a * M2.a + M1.c * M2.b;
   product.b = M1.b * M2.a + M1.d * M2.b;
   product.c = M1.a * M2.c + M1.c * M2.d;
   product.d = M1.b * M2.c + M1.d * M2.d;
   // Translation part: M2's offset mapped through M1, plus M1's own offset
   product.x = M1.a * M2.x + M1.c * M2.y + M1.x;
   product.y = M1.b * M2.x + M1.d * M2.y + M1.y;
   return product;
}

// Creates (or truncates) the output file and writes the little endian byte
// order mark (0xFF 0xFE) to it.
//
// A file that is still open from a previous Open() call is closed first, so
// that its handle is not leaked when m_File is overwritten.
//
// FileName: path of the output file; must not be NULL.
// Throws:   const char* "Cannot open file!" if FileName is NULL or the file
//           cannot be opened.
void CTextExtraction::Open(const char* FileName)
{
   // Release the stream of a previous Open() call
   if (m_File)
   {
      fclose(m_File);
      m_File = NULL;
   }
   // Passing a NULL path to fopen() is not allowed; report it like any other
   // open failure.
   if (!FileName) throw "Cannot open file!";

   m_File = fopen(FileName, "w+b");
   if (!m_File) throw "Cannot open file!";
   // Add a little endian identifier to the file
   fwrite("\377\376", 1, 2, m_File);
}

// Extracts the text of the currently open page including the text of all
// templates the page uses.
//
// The list of already processed templates and the text position state are
// reset before the page is parsed.
//
// Throws: int (-1) if pdfInitStack() fails.
void CTextExtraction::ParsePage(void)
{
   // Forget the templates that were visited on a previous page
   m_Templates.Clear();
   if (!pdfInitStack(m_PDF, &m_Stack))
   {
      throw -1;
   }

   // Start without a previous text record
   m_LastTextDir  = tfNotInitialized;
   m_LastTextEndX = m_LastTextEndY = 0.0;
   m_LastTextInfX = m_LastTextInfY = 0.0;

   // First the page's own text, then the text of its templates
   ParseText();
   ParseTemplates();
}

// Templates are parsed recursively.
void CTextExtraction::ParseTemplates(void)
{
   SI32 i, j, tmpl, tmplCount, tmplCount2;
   tmplCount = pdfGetTemplCount(m_PDF);
   for (i = 0; i < tmplCount; i++)
   {
      if (!pdfEditTemplate(m_PDF, i)) throw -1;
      // We must check whether the template was already processed. This is very
      // important to avoid an endless loop if a template references itself.
      tmpl = pdfGetTemplHandle(m_PDF);
      if (m_Templates.Find(tmpl) < 0)
      {
         m_Templates.Add(tmpl);
         if (!pdfInitStack(m_PDF, &m_Stack)) throw -1;

         ParseText();

         tmplCount2 = pdfGetTemplCount(m_PDF);
         for (j = 0; j < tmplCount2; j++)
         {
            ParseTemplates();
         }
         pdfEndTemplate(m_PDF);
      }else
         pdfEndTemplate(m_PDF);
   }
}

void CTextExtraction::ParseText(void)
{
   LBOOL haveMore;
   // Get the first text record if any
   haveMore = pdfGetPageText(m_PDF, &m_Stack);
   // No text found?
   if (!haveMore && !m_Stack.TextLen) return;
   AddText();
   if (haveMore)
   {
      while (pdfGetPageText(m_PDF, &m_Stack))
      {
         AddText();
      }
   }
}

// Applies the matrix M to the point (x, y) in place:
//   x' = x * M.a + y * M.c + M.x
//   y' = x * M.b + y * M.d + M.y
inline void CTextExtraction::Transform(TCTM &M, double &x, double &y)
{
   // Compute both results from the untouched input before storing them
   const double newX = x * M.a + y * M.c + M.x;
   const double newY = x * M.b + y * M.d + M.y;
   x = newX;
   y = newY;
}

void CTextExtraction::WritePageIdentifier(const wchar_t* Fmt, ...)
{
   if (m_PageCount > 0)
   {
      UI16 newLine[] = {13, 10};
      // Add a new line to the output file
      fwrite(newLine, 2, 2, m_File);
   }
   va_list args;
   va_start(args, Fmt);
   vfwprintf(m_File, Fmt, args);
   va_end(args);
   ++m_PageCount;
}
