Forráskód Böngészése

'nnyverter.py' hinzufügen

tsi 5 éve
szülő
commit
c616493cbc
1 módosított fájl, 47 hozzáadás és 0 törlés
  1. 47 0
      nnyverter.py

+ 47 - 0
nnyverter.py

@@ -0,0 +1,47 @@
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.converter import TextConverter
+from pdfminer.layout import LAParams
+from pdfminer.pdfpage import PDFPage
+from io import StringIO
+import re
+import csv
+
+def convert_pdf_to_txt(path):
+    rsrcmgr = PDFResourceManager()
+    retstr = StringIO()
+    codec = 'utf-8'
+    laparams = LAParams()
+    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
+    fp = open(path, 'rb')
+    interpreter = PDFPageInterpreter(rsrcmgr, device)
+    password = ""
+    maxpages = 0
+    caching = True
+    pagenos=set()
+
+    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
+        interpreter.process_page(page)
+
+    text = retstr.getvalue()
+
+    fp.close()
+    device.close()
+    retstr.close()
+    return text
+
+regex = r"^Tour Nr:\n\n(\d*)\n\nAbfahrt:.*?StellPl.Maut.*?^(\d*,\d*|0)\sKM$"
+
+out = convert_pdf_to_txt("./vp.pdf")
+
+matches = re.finditer(regex, out, re.MULTILINE | re.DOTALL)
+with open('eggs.csv', 'w', newline='') as csvfile:
+    spamwriter = csv.writer(csvfile, delimiter=';',
+                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
+
+    i = 1
+
+    for matchNum, match in enumerate(matches, start=1):
+        ##print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
+        spamwriter.writerow([str("=\"" + match.group(1) + "\"")] + [str(match.group(2)).replace(",",".")])
+        print ("Tour " + str(i) + "," + str(match.group(1)) + "," + str(match.group(2)).replace(",","."))
+        i = i+1