2.11. Read PDF

2.11.1. Use Case - 1

>>> 
... """
... >>> import sys, importlib
... >>> assert sys.version_info >= (3, 11), 'Python 3.11+ required'
...
... >>> try:
... ...     _ = importlib.import_module('tabula')
... ... except Exception as err:
... ...     print('pip install tabula-py')
... """
...
... import pandas as pd
... import tabula
...
...
... pd.set_option('display.width', 250)
... pd.set_option('display.max_columns', 15)
... pd.set_option('display.max_rows', 200)
...
... def clean(df):
...     return (
...         df
...         .convert_dtypes()
...         .rename(axis='columns', to_rename={
...             'Unnamed: 0': 'Number',
...             'Item': 'License_plate',
...             'Course date': 'Date',
...             'Amount charged PLN': 'Amount'})
...         .drop(columns='Lp')
...         .iloc[:-1]
...         .transform({
...             'Data': lambda column: pd.to_datetime(column, format='%d.%m.%Y', errors='coerce'),
...             'Kwota': lambda column: column.str.replace(',', '.').astype('float'),
...             'Rejestracja': lambda column: column.str.extract(r'VRM: ([A-Z0-9a-z]+)Identy')})
...         .fillna(method='ffill')
...         .droplevel(1, axis='columns')
...         .dropna()
...         .set_index('Data'))
...
...
... if __name__ == '__main__':
...     dfs = tabula.read_pdf('myfile.pdf', pages='all')
...     df = pd.concat(map(clean, dfs[4:8])).sort_index()