C# - GetPDFContentString


SUBMITTED BY: TheSwarm

DATE: Oct. 21, 2015, 8:34 a.m.

FORMAT: Text only

SIZE: 1.0 kB

HITS: 1664

  /// <summary>
  /// Collects the string operands found in the content stream of every page
  /// of a PDF file and returns them as one string.
  /// </summary>
  /// <param name="path">Path of the PDF file to read.</param>
  /// <returns>
  /// The concatenated string tokens of all pages; a newline is appended each
  /// time an <c>ET</c> operator (end of a text object) is encountered.
  /// </returns>
  /// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
  /// <remarks>
  /// Failures while opening or parsing the file propagate as exceptions instead
  /// of being returned as if they were extracted text.
  /// </remarks>
  String GetPDFContentString(String path)
  {
      if (path == null)
      {
          throw new ArgumentNullException("path");
      }

      var sb = new StringBuilder();
      var reader = new PdfReader(path);
      try
      {
          // PDF page numbers are 1-based.
          for (Int32 i = 1; i <= reader.NumberOfPages; i++)
          {
              // GetPageContent resolves /Contents whether it is a single stream
              // or an array of streams, so no manual reference lookup is needed.
              byte[] contentBytes = reader.GetPageContent(i);
              if (contentBytes == null || contentBytes.Length == 0)
              {
                  // Page without a content stream: nothing to extract.
                  continue;
              }

              var token = new PRTokeniser(contentBytes);
              while (token.NextToken())
              {
                  switch (token.TokenType)
                  {
                      case PRTokeniser.TK_STRING:
                          sb.Append(token.StringValue);
                          break;
                      case PRTokeniser.TK_OTHER:
                          // "ET" closes a text object; use it as a line separator.
                          if (token.StringValue == "ET")
                          {
                              sb.Append("\n");
                          }
                          break;
                  }
              }
          }
      }
      finally
      {
          // Setting the variable to null does not release the file; Close does.
          reader.Close();
      }

      return sb.ToString();
  }

comments powered by Disqus