# nnyverter.py (2.1 KB)
import csv
import glob
import os
import re
from contextlib import closing
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
  9. def convert_pdf_to_txt(path):
  10. rsrcmgr = PDFResourceManager()
  11. retstr = StringIO()
  12. codec = 'utf-8'
  13. laparams = LAParams()
  14. device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
  15. fp = open(path, 'rb')
  16. interpreter = PDFPageInterpreter(rsrcmgr, device)
  17. password = ""
  18. maxpages = 0
  19. caching = True
  20. pagenos=set()
  21. for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
  22. interpreter.process_page(page)
  23. text = retstr.getvalue()
  24. fp.close()
  25. device.close()
  26. retstr.close()
  27. return text
  28. regex = r"^Tour Nr:\n\n(\d*)\n\nAbfahrt:.*?StellPl.Maut.*?^(\d*(,\d*)?|0)\sKM$"
  29. os.chdir("./")
  30. print("moin")
  31. with open("ergebnis.csv", 'w', newline='') as csvfile:
  32. spamwriter = csv.writer(csvfile, delimiter=';',
  33. quotechar='|', quoting=csv.QUOTE_MINIMAL)
  34. for file in glob.glob("*.pdf"):
  35. daily = 0.0
  36. print("Beginne mit " + file + " alles guddes!")
  37. spamwriter.writerow([str(file)])
  38. out = convert_pdf_to_txt(file)
  39. file = open("testfile.txt","w", encoding='utf-8')
  40. file.write(out)
  41. file.close()
  42. matches = re.finditer(regex, out, re.MULTILINE | re.DOTALL)
  43. i = 1
  44. for matchNum, match in enumerate(matches, start=1):
  45. ##print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
  46. spamwriter.writerow([str("=\"" + match.group(1) + "\"")] + [str(match.group(2)).replace(",",",")])
  47. print ("Tour " + str(i) + "," + str(match.group(1)) + "," + str(match.group(2)).replace(",","."))
  48. daily = daily + float(match.group(2).replace(",","."))
  49. i = i+1
  50. print(daily)
  51. spamwriter.writerow([])
  52. print("ok")