"""Collect per-tour kilometre figures from PDF reports into ``ergebnis.csv``.

Every ``*.pdf`` in the current working directory is converted to text with
pdfminer, scanned with ``regex`` for tour entries, and written to the CSV
together with a per-file subtotal ("Gesamt (Tag)") and a grand total over all
files ("Gesamt (Monat)").
"""
from io import StringIO
import csv
import glob
import os
import re

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


def convert_pdf_to_txt(path):
    """Return the text of every page of the PDF at *path* as one string.

    Args:
        path: File system path of the PDF to read.

    Returns:
        The text pdfminer's ``TextConverter`` produced for all pages.

    Raises:
        OSError: If *path* cannot be opened for reading.
        Exception: Whatever pdfminer raises while parsing is propagated
            unchanged; the file handle, the converter device and the text
            buffer are closed in that case as well.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set + maxpages=0 is what the original passed;
            # presumably "all pages, no limit" - confirm against pdfminer docs.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Evaluated before the ``finally`` clause closes the buffer.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()


# Group 1 is the tour number, group 3 the distance in front of "KM".
# Group 2 sits inside the negative lookahead and therefore never captures;
# group 4 is the optional ",<digits>" part nested inside group 3.
# NOTE(review): the lookahead is only tested at the single position right
# before "\nMaut", so it may not exclude "StellPl" the way it appears to be
# intended - confirm against real reports before changing the pattern.
regex = r"^Tour Nr:\n\n(\d*)\n\nAbfahrt:.*?(?!(StellPl.|StellPl))\nMaut.*?^(\d*(,\d*)?|0)\sKM$"

# Compiled once so the per-file loop does not repeat the pattern lookup.
_TOUR_PATTERN = re.compile(regex, re.MULTILINE | re.DOTALL)


def _parse_km(raw):
    """Convert a captured distance such as ``"12,5"`` to a float.

    Returns ``None`` when *raw* is not a number. The pattern allows that:
    ``\\d*`` also matches the empty string, and a lone "," matches too.
    """
    try:
        return float(raw.replace(",", "."))
    except ValueError:
        return None


def _as_text_formula(value):
    """Wrap *value* as ``="value"``, the cell format the report has always used.

    Presumably this makes spreadsheet programs keep the value as literal
    text - verify with the consumers of the CSV.
    """
    return "=\"" + str(value) + "\""


def main():
    """Process all PDFs in the working directory and write ``ergebnis.csv``."""
    total = 0.0
    print("moin")
    with open("ergebnis.csv", 'w', newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        for pdf_name in glob.glob("*.pdf"):
            daily = 0.0
            print("Beginne mit " + pdf_name + " alles guddes!")
            spamwriter.writerow([str(pdf_name)])
            text = convert_pdf_to_txt(pdf_name)

            for index, match in enumerate(_TOUR_PATTERN.finditer(text), start=1):
                tour_nr = match.group(1)
                km_text = match.group(3)
                # The CSV keeps the distance exactly as printed in the PDF
                # (decimal comma); only console output and sums use a dot.
                spamwriter.writerow([_as_text_formula(tour_nr), km_text])
                print("Tour " + str(index) + "," + tour_nr + ","
                      + km_text.replace(",", "."))

                km = _parse_km(km_text)
                if km is None:
                    # Previously float("") aborted the whole run here.
                    print("Warnung: Tour " + tour_nr + " ohne lesbare KM-Angabe ("
                          + repr(km_text) + "), wird als 0 gezaehlt")
                    km = 0.0
                daily += km

            print(daily)
            total += daily
            spamwriter.writerow(["Gesamt (Tag)", _as_text_formula(round(daily, 2))])
            spamwriter.writerow([])

        print("ok")
        spamwriter.writerow(["Gesamt (Monat)", _as_text_formula(round(total, 2))])


if __name__ == "__main__":
    main()