python - Values from a class instance's attribute being added to a different instance of the same class -


this question has answer here:

i'm parsing pdfs extract table data using pdftable class. when create class instance create class instance seems first class instance file_1.cells being prepended second class instance file_2.cells. cannot figure out why happening don't think i'm creating class variables instance variables. reason data set_cells persisted when class instance instantiated. happening?

from pdfminer.pdfdocument import pdfdocument pdfminer.pdfpage import pdfpage pdfminer.pdfparser import pdfparser pdfminer.pdfinterp import pdfresourcemanager, pdfpageinterpreter pdfminer.converter import pdfpageaggregator pdfminer.layout import laparams, lttextbox, lttextboxhorizontal, lttextlinehorizontal tabulate import tabulate utils import clean_string collections import namedtuple   class pdftable(object):      def __init__(self, file_name):         self.file_name = file_name         self.table_headers = none         self.cells = none         self.rows = none      def process_file(self, file_name):         pages = []         open(file_name, 'rb') fp:             parser = pdfparser(fp)             doc = pdfdocument(parser)             rsrcmgr = pdfresourcemanager()             laparams = laparams()             device = pdfpageaggregator(rsrcmgr, laparams=laparams)             interpreter = pdfpageinterpreter(rsrcmgr, device)              page in pdfpage.create_pages(doc):                 interpreter.process_page(page)                 pages.append(device.get_result())          return pages      def set_table_headers(self, page_obj, table_headers={}):         values = ('name', 'value', 'reference range')         header = namedtuple('header', 'name, x0, y0')          obj in page_obj:             if isinstance(obj, lttextlinehorizontal):                 text = clean_string(obj.get_text())                 if text in values:                     table_headers[text] = header(text, obj.bbox[0], obj.bbox[1])             elif isinstance(obj, lttextboxhorizontal):                 self.set_table_headers(obj, table_headers)          return table_headers      def set_cells(self, page, headers, cells=[]):         cell = namedtuple("cell", "col, text, x0, y0")         col = none         text = none          obj in page:             if isinstance(obj, lttextlinehorizontal):                 obj_x0 = obj.bbox[0]                 obj_y0 = obj.bbox[1]                  if obj_y0 < headers['name'].y0 , (obj_x0 == headers['name'].x0 or                    obj_x0 == headers['value'].x0 or obj_x0 == headers['reference range'].x0):                      if obj_x0 == headers['name'].x0:                         col = 'name'                     elif obj_x0 == headers['value'].x0:                         col = 'value'                     elif obj_x0 == headers['reference range'].x0:                         col = 'reference range'                      text = clean_string(obj.get_text())                     if text:                         cells.append(cell(col, text, obj_x0, obj_y0))              elif isinstance(obj, lttextboxhorizontal):                 self.set_cells(obj, headers, cells)          return cells      def set_rows(self, cells):         cell = namedtuple("cell", "col, text, x0, y0")         row = namedtuple('row', 'test, value, ref_range, y0')          name_col = [cell cell in cells if cell.col == 'name']         value_col = [cell cell in cells if cell.col == 'value']         ref_col = [cell cell in cells if cell.col == 'reference range']          # normalize val col blank cells match name col length         values_y0 = [cell.y0 cell in value_col]         missing_val_cells = [cell.y0 cell in name_col if cell.y0 not in values_y0]         value_col += [cell('value', none, none, y) y in missing_val_cells]          rows = [row(name_cell.text, value_cell.text, ref_cell.text, name_cell.y0)                 name_cell in name_col value_cell in value_col                 ref_cell in ref_col                 if name_cell.y0 == value_cell.y0 == ref_cell.y0]          return rows      def parse_pages(self):         pages = self.process_file(self.file_name)         self.set_metadata(pages[0])          page in pages:             self.table_headers = self.set_table_headers(page)             self.cells = self.set_cells(page, self.table_headers)          self.rows = self.set_rows(self.cells)   if __name__ == "__main__":     file_1 = pdftable("rawdata/pdfs/3768958-2.pdf")     file_1.parse_pages()      print("file_1 cells")     print tabulate(file_1.cells, headers="keys", showindex="always")      file_2 = pdftable("rawdata/pdfs/3768959.pdf")     file_2.parse_pages()      print("\nfile_2 cells")     print tabulate(file_2.cells, headers="keys", showindex="always") 

file_1.cells

    col              text                   x0       y0 --  ---------------  ---------------  --------  -------  0  name             tp                42.8571  570.887  1  name             rin               42.8571  554.172  2  value            13.5             221.716   570.887  3  value            1.0              221.716   554.172  4  reference range  11.8-14.2 (sec)  412.555   570.887  5  reference range  0.8-1.2          412.555   554.172 

file_2.cells

    col              text                        x0       y0 --  ---------------  --------------------  --------  -------  0  name             tp                     42.8571  570.887  1  name             rin                    42.8571  554.172  2  value            13.5                  221.716   570.887  3  value            1.0                   221.716   554.172  4  reference range  11.8-14.2 (sec)       412.555   570.887  5  reference range  0.8-1.2               412.555   554.172  6  name             rsw                    42.8571  570.887  7  name             bcw                    42.8571  554.172  8  value            8.7                   221.716   570.887  9  value            25.6                  221.716   554.172 10  reference range  4.5-12.5              412.555   570.887 11  reference range  14.0-30.0             412.555   554.172 

expected file_2.cells

    col              text                        x0       y0 --  ---------------  --------------------  --------  -------  0  name             rsw                    42.8571  570.887  1  name             bcw                    42.8571  554.172  2  value            8.7                   221.716   570.887  3  value            25.6                  221.716   554.172  4  reference range  4.5.-12.5             412.555   570.887  5  reference range  14.0-30.0             412.555   554.172 

not file_1.cells prepended file_2.cells after processing file_2, file_1.cells combination of cells both instances.

you have mutable default arguments table_headers={} , cells=[] issue, or @ least can cause other issues. these values shared across calls methods changes in 1 call reflected elsewhere.


Comments

Popular posts from this blog

php - How to display all orders for a single product showing the most recent first? Woocommerce -

asp.net - How to correctly use QUERY_STRING in ISAPI rewrite? -

angularjs - How restrict admin panel using in backend laravel and admin panel on angular? -