python - Values from a class instance's attribute being added to a different instance of the same class -
this question has answer here:
i'm parsing pdfs extract table data using pdftable class. when create class instance create class instance seems first class instance file_1.cells being prepended second class instance file_2.cells. cannot figure out why happening don't think i'm creating class variables instance variables. reason data set_cells persisted when class instance instantiated. happening?
from pdfminer.pdfdocument import pdfdocument pdfminer.pdfpage import pdfpage pdfminer.pdfparser import pdfparser pdfminer.pdfinterp import pdfresourcemanager, pdfpageinterpreter pdfminer.converter import pdfpageaggregator pdfminer.layout import laparams, lttextbox, lttextboxhorizontal, lttextlinehorizontal tabulate import tabulate utils import clean_string collections import namedtuple class pdftable(object): def __init__(self, file_name): self.file_name = file_name self.table_headers = none self.cells = none self.rows = none def process_file(self, file_name): pages = [] open(file_name, 'rb') fp: parser = pdfparser(fp) doc = pdfdocument(parser) rsrcmgr = pdfresourcemanager() laparams = laparams() device = pdfpageaggregator(rsrcmgr, laparams=laparams) interpreter = pdfpageinterpreter(rsrcmgr, device) page in pdfpage.create_pages(doc): interpreter.process_page(page) pages.append(device.get_result()) return pages def set_table_headers(self, page_obj, table_headers={}): values = ('name', 'value', 'reference range') header = namedtuple('header', 'name, x0, y0') obj in page_obj: if isinstance(obj, lttextlinehorizontal): text = clean_string(obj.get_text()) if text in values: table_headers[text] = header(text, obj.bbox[0], obj.bbox[1]) elif isinstance(obj, lttextboxhorizontal): self.set_table_headers(obj, table_headers) return table_headers def set_cells(self, page, headers, cells=[]): cell = namedtuple("cell", "col, text, x0, y0") col = none text = none obj in page: if isinstance(obj, lttextlinehorizontal): obj_x0 = obj.bbox[0] obj_y0 = obj.bbox[1] if obj_y0 < headers['name'].y0 , (obj_x0 == headers['name'].x0 or obj_x0 == headers['value'].x0 or obj_x0 == headers['reference range'].x0): if obj_x0 == headers['name'].x0: col = 'name' elif obj_x0 == headers['value'].x0: col = 'value' elif obj_x0 == headers['reference range'].x0: col = 'reference range' text = clean_string(obj.get_text()) if text: cells.append(cell(col, text, obj_x0, obj_y0)) elif isinstance(obj, lttextboxhorizontal): self.set_cells(obj, headers, cells) return cells def set_rows(self, cells): cell = namedtuple("cell", "col, text, x0, y0") row = namedtuple('row', 'test, value, ref_range, y0') name_col = [cell cell in cells if cell.col == 'name'] value_col = [cell cell in cells if cell.col == 'value'] ref_col = [cell cell in cells if cell.col == 'reference range'] # normalize val col blank cells match name col length values_y0 = [cell.y0 cell in value_col] missing_val_cells = [cell.y0 cell in name_col if cell.y0 not in values_y0] value_col += [cell('value', none, none, y) y in missing_val_cells] rows = [row(name_cell.text, value_cell.text, ref_cell.text, name_cell.y0) name_cell in name_col value_cell in value_col ref_cell in ref_col if name_cell.y0 == value_cell.y0 == ref_cell.y0] return rows def parse_pages(self): pages = self.process_file(self.file_name) self.set_metadata(pages[0]) page in pages: self.table_headers = self.set_table_headers(page) self.cells = self.set_cells(page, self.table_headers) self.rows = self.set_rows(self.cells) if __name__ == "__main__": file_1 = pdftable("rawdata/pdfs/3768958-2.pdf") file_1.parse_pages() print("file_1 cells") print tabulate(file_1.cells, headers="keys", showindex="always") file_2 = pdftable("rawdata/pdfs/3768959.pdf") file_2.parse_pages() print("\nfile_2 cells") print tabulate(file_2.cells, headers="keys", showindex="always")
file_1.cells
col text x0 y0 -- --------------- --------------- -------- ------- 0 name tp 42.8571 570.887 1 name rin 42.8571 554.172 2 value 13.5 221.716 570.887 3 value 1.0 221.716 554.172 4 reference range 11.8-14.2 (sec) 412.555 570.887 5 reference range 0.8-1.2 412.555 554.172
file_2.cells
col text x0 y0 -- --------------- -------------------- -------- ------- 0 name tp 42.8571 570.887 1 name rin 42.8571 554.172 2 value 13.5 221.716 570.887 3 value 1.0 221.716 554.172 4 reference range 11.8-14.2 (sec) 412.555 570.887 5 reference range 0.8-1.2 412.555 554.172 6 name rsw 42.8571 570.887 7 name bcw 42.8571 554.172 8 value 8.7 221.716 570.887 9 value 25.6 221.716 554.172 10 reference range 4.5-12.5 412.555 570.887 11 reference range 14.0-30.0 412.555 554.172
expected file_2.cells
col text x0 y0 -- --------------- -------------------- -------- ------- 0 name rsw 42.8571 570.887 1 name bcw 42.8571 554.172 2 value 8.7 221.716 570.887 3 value 25.6 221.716 554.172 4 reference range 4.5.-12.5 412.555 570.887 5 reference range 14.0-30.0 412.555 554.172
not file_1.cells prepended file_2.cells after processing file_2, file_1.cells combination of cells both instances.
you have mutable default arguments table_headers={}
, cells=[]
issue, or @ least can cause other issues. these values shared across calls methods changes in 1 call reflected elsewhere.
Comments
Post a Comment