| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
- from pdfminer.converter import TextConverter
- from pdfminer.layout import LAParams
- from pdfminer.pdfpage import PDFPage
- from io import StringIO
- import re
- import csv
- import glob, os
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file with pdfminer and return it.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).

    Returns:
        Everything the ``TextConverter`` wrote while the pages were
        processed, as a single string.

    Raises:
        OSError: If ``path`` cannot be opened.
        Any exception raised by pdfminer while parsing is propagated; the
        file, the converter device and the text buffer are still closed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            # Same arguments as before: empty page-number set, maxpages=0,
            # blank password. NOTE(review): pdfminer is expected to treat
            # these as "all pages, no password" - confirm against its docs.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the ``finally`` clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
# Pattern for one tour section of the extracted PDF text (applied with
# re.MULTILINE | re.DOTALL):
#   group 1 - the digit run between "Tour Nr:" and "Abfahrt:" (each
#             separated from it by a blank line)
#   group 2 - the number (decimal comma allowed) of the first
#             "<number> KM" line found after "StellPl.Maut"
regex = r"^Tour Nr:\n\n(\d*)\n\nAbfahrt:.*?StellPl.Maut.*?^(\d*(,\d*)?|0)\sKM$"


def _km_to_float(raw):
    """Convert a captured kilometre value such as ``"12,5"`` to a float.

    The pattern above also accepts an empty or comma-only value, which
    ``float`` rejects. Such values are reported on stdout and counted as
    0.0 so that one odd line does not abort the whole run.
    """
    try:
        return float(raw.replace(",", "."))
    except ValueError:
        print("Warnung: KM-Wert " + repr(raw) + " nicht lesbar, zaehle 0")
        return 0.0


os.chdir("./")  # "./" is the current directory, so this changes nothing
print("moin")
with open("ergebnis.csv", 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=';',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # Compile once; the same pattern is applied to every PDF.
    tour_pattern = re.compile(regex, re.MULTILINE | re.DOTALL)

    for pdf_name in glob.glob("*.pdf"):
        daily = 0.0  # running kilometre total for this PDF
        print("Beginne mit " + pdf_name + " alles guddes!")
        spamwriter.writerow([str(pdf_name)])
        out = convert_pdf_to_txt(pdf_name)

        # Dump of the extracted text; overwritten for every PDF, so it
        # always holds the text of the most recently processed file.
        with open("testfile.txt", "w", encoding='utf-8') as dump:
            dump.write(out)

        for tour_index, match in enumerate(tour_pattern.finditer(out), start=1):
            tour_nr = match.group(1)
            km_text = match.group(2)
            # The ="..." wrapper presumably makes spreadsheet programs keep
            # the tour number as text - TODO confirm. The kilometre value
            # is written exactly as captured (decimal comma preserved).
            spamwriter.writerow(["=\"" + tour_nr + "\"", km_text])
            print("Tour " + str(tour_index) + "," + tour_nr + ","
                  + km_text.replace(",", "."))
            daily = daily + _km_to_float(km_text)

        print(daily)
        spamwriter.writerow(["Gesamt", "=\"" + str(round(daily, 2)) + "\""])
        spamwriter.writerow([])  # blank separator row between PDFs
print("ok")
|