| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647 |
- from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
- from pdfminer.converter import TextConverter
- from pdfminer.layout import LAParams
- from pdfminer.pdfpage import PDFPage
- from io import StringIO
- import re
- import csv
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file using pdfminer.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``TextConverter`` that writes into an in-memory buffer; the accumulated
    buffer contents are returned.

    Args:
        path: Filesystem path of the PDF; it is opened in binary mode.

    Returns:
        The extracted text as a single ``str``.

    Raises:
        OSError: If ``path`` cannot be opened.
        Errors raised by pdfminer while parsing propagate unchanged; the
        file, the converter device and the buffer are still closed.
    """
    rsrcmgr = PDFResourceManager()
    # The context managers guarantee the buffer and the file are released
    # even when pdfminer raises part-way through the document.
    with StringIO() as retstr, open(path, 'rb') as fp:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page-number set, maxpages=0 and an empty password are
            # passed through as-is; presumably pdfminer treats the first two
            # as "no restriction" -- confirm against the pdfminer docs.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
            # Read the buffer before the enclosing ``with`` closes it.
            return retstr.getvalue()
        finally:
            device.close()
# Pattern for one tour record in the extracted PDF text.
#   group(1): the digits on the line that follows "Tour Nr:" and a blank line
#   group(2): the first line after the "StellPl.Maut" marker that consists of
#             "<digits>,<digits> KM" (or "0 KM"), without the " KM" suffix
# It needs re.MULTILINE (so ^/$ anchor at line boundaries) and re.DOTALL
# (so the lazy ".*?" may span several lines).
regex = r"^Tour Nr:\n\n(\d*)\n\nAbfahrt:.*?StellPl.Maut.*?^(\d*,\d*|0)\sKM$"
out = convert_pdf_to_txt("./vp.pdf")
matches = re.finditer(regex, out, re.MULTILINE | re.DOTALL)

# Write one "tour number;kilometres" row per match and echo it to stdout.
with open('eggs.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=';',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # enumerate already supplies the 1-based running number, so no separate
    # hand-maintained counter is needed.
    for matchNum, match in enumerate(matches, start=1):
        tour_nr = match.group(1)
        # Decimal comma -> decimal point.
        kilometres = match.group(2).replace(",", ".")
        # The tour number is wrapped as ="<nr>"; presumably so a spreadsheet
        # keeps it as text rather than a number -- confirm with the consumer
        # of eggs.csv.
        spamwriter.writerow(["=\"" + tour_nr + "\"", kilometres])
        print("Tour " + str(matchNum) + "," + tour_nr + "," + kilometres)
|