nnyverter.py 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
  2. from pdfminer.converter import TextConverter
  3. from pdfminer.layout import LAParams
  4. from pdfminer.pdfpage import PDFPage
  5. from io import StringIO
  6. import re
  7. import csv
  8. def convert_pdf_to_txt(path):
  9. rsrcmgr = PDFResourceManager()
  10. retstr = StringIO()
  11. codec = 'utf-8'
  12. laparams = LAParams()
  13. device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
  14. fp = open(path, 'rb')
  15. interpreter = PDFPageInterpreter(rsrcmgr, device)
  16. password = ""
  17. maxpages = 0
  18. caching = True
  19. pagenos=set()
  20. for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
  21. interpreter.process_page(page)
  22. text = retstr.getvalue()
  23. fp.close()
  24. device.close()
  25. retstr.close()
  26. return text
  27. regex = r"^Tour Nr:\n\n(\d*)\n\nAbfahrt:.*?StellPl.Maut.*?^(\d*,\d*|0)\sKM$"
  28. out = convert_pdf_to_txt("./vp.pdf")
  29. matches = re.finditer(regex, out, re.MULTILINE | re.DOTALL)
  30. with open('eggs.csv', 'w', newline='') as csvfile:
  31. spamwriter = csv.writer(csvfile, delimiter=';',
  32. quotechar='|', quoting=csv.QUOTE_MINIMAL)
  33. i = 1
  34. for matchNum, match in enumerate(matches, start=1):
  35. ##print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
  36. spamwriter.writerow([str("=\"" + match.group(1) + "\"")] + [str(match.group(2)).replace(",",".")])
  37. print ("Tour " + str(i) + "," + str(match.group(1)) + "," + str(match.group(2)).replace(",","."))
  38. i = i+1