|
|
@@ -0,0 +1,47 @@
|
|
|
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
|
+from pdfminer.converter import TextConverter
|
|
|
+from pdfminer.layout import LAParams
|
|
|
+from pdfminer.pdfpage import PDFPage
|
|
|
+from io import StringIO
|
|
|
+import re
|
|
|
+import csv
|
|
|
+
|
|
|
def convert_pdf_to_txt(path):
    """Return the text of the PDF file at *path* as a single string.

    Each page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` that feeds a ``TextConverter`` writing into an
    in-memory ``StringIO`` buffer; the buffer's contents are returned.

    Args:
        path: Filesystem path of the PDF; it is opened in binary mode.

    Returns:
        The accumulated text written by the converter for all processed pages.

    Raises:
        OSError: If *path* cannot be opened.
        Any exception raised by pdfminer while parsing is propagated; the
        file, the converter device and the buffer are closed in every case.
    """
    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # Empty page-number set and maxpages=0 are passed through as in
                # the original call; NOTE(review): in pdfminer these are
                # understood to mean "no page filter / no page limit" - confirm
                # against the installed version.
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
            # Read the buffer before the device and buffer are closed.
            return retstr.getvalue()
        finally:
            device.close()
|
|
|
+
|
|
|
# Pattern for one tour record in the extracted PDF text.
#   group 1: the digits on the line two newlines after "Tour Nr:"
#   group 2: the value at the start of a line ending in " KM" - either a
#            number written with a decimal comma or a bare "0"
regex = r"^Tour Nr:\n\n(\d*)\n\nAbfahrt:.*?StellPl.Maut.*?^(\d*,\d*|0)\sKM$"

out = convert_pdf_to_txt("./vp.pdf")

# MULTILINE makes ^ and $ anchor at line boundaries; DOTALL lets the lazy
# ".*?" gaps span several lines of the record.
matches = re.finditer(regex, out, re.MULTILINE | re.DOTALL)
with open('eggs.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=';',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)

    # matchNum (1-based, from enumerate) is the running tour counter.
    for matchNum, match in enumerate(matches, start=1):
        tour_nr = match.group(1)
        # Convert the decimal comma to a decimal point once, for both outputs.
        km = match.group(2).replace(",", ".")
        # The tour number is written as ="<digits>"; presumably so spreadsheet
        # applications keep it as text - TODO confirm with the CSV consumer.
        spamwriter.writerow(["=\"" + tour_nr + "\"", km])
        print("Tour " + str(matchNum) + "," + tour_nr + "," + km)
|