Extract table of content from docx python

30 views Asked by At

So, currently I have been trying to extract table of content, not just the table of content but extract it differentiating headers from sub-headers and nested sub-headers as well Table of content structure

I have been trying to make a script which makes a dictionary similar to this structure:

{
  "headers": {
    "Table of Contents": [
      {
        "name": "Introduction",
        "subheaders": [
          {
            "name": "Permits for the Introduction of Certain Regulated Articles",
            "subheaders": [
              {
                "name": "Regulated Articles Under 7 CFR part 340"
              },
              {
                "name": "Exemptions Under ยง 340.2(b)"
              }
            ]
          },
          {
            "name": "Other Federal and State Regulations"
          },
          {
            "name": "Containment Facilities"
          },
          {
            "name": "Time Frame for Review and Issuance of a Permit"
          },
          {
            "name": "NEPA Document (EA or EIS)"
          }
        ]
      },
      {
        "name": "Applying for a Permit",
        "subheaders": [
          {
            "name": "Select the Type of Application and Submission Method",
            "subheaders": [
              {
                "name": "Electronic Permit Application"
              },
              {
                "name": "Paper Submission"
              }
            ]
          },
          {
            "name": "Set up an ePermits Account"
          },
          {
            "name": "Helpful Tips for ePermits"
          },
          {
            "name": "Create a Permit Application",
            "subheaders": [
              {
                "name": "Permit or Notification"
              },
              {
                "name": "Select Agent"
              },
              {
                "name": "Submission Method"
              },
              {
                "name": "Application Ownership - Responsible Person or Preparer"
              },
              {
                "name": "Select New Permit, Amendment, Renewal",
                "subheaders": [
                  {
                    "name": "Amendments"
                  },
                  {
                    "name": "Renewal"
                  }
                ]
              },
              {
                "name": "Courtesy Permit"
              },
              {
                "name": "Select the Introduction Type",
                "subheaders": [
                  {
                    "name": "Number of Releases, Points of Origins, Destinations and Duration"
                  }
                ]
              },
              {
                "name": "Select Confidential Business Information (CBI) or No CBI"
              }
            ]
          }
        ]
      },
      {
        "name": "Application Submission - Data Requirements",
        "subheaders": [
          {
            "name": "Provide a CBI Justification Statement"
          },
          {
            "name": "Purpose of Permit",
            "subheaders": [
              {
                "name": "Industrial Product"
              },
              {
                "name": "Pharmaceutical Product"
              },
              {
                "name": "Phytoremediation"
              }
            ]
          }
        ]
      }
    ]
  }
}

This is what I've come up with so far, still cant figure out the higher levels of nesting (sub-sub-sub-headers and deeper )

from docx import Document

def detect_headers_and_subheaders(doc):
    headers = {}
    current_headers = [headers]  # Stack to track nested headers
    current_levels = [0]  # Stack to track current levels

    for paragraph in doc.paragraphs:
        if paragraph.style.name.startswith('Heading'):
            header_text = paragraph.text.strip()
            level = int(paragraph.style.name.split(' ')[-1])

            if level == 1:
                headers[header_text] = {}
                current_headers = [headers]
                current_levels = [1]
            elif level > current_levels[-1]:
                current_headers[-1][header_text] = {}
                current_headers.append(current_headers[-1][header_text])
                current_levels.append(level)
            elif level == current_levels[-1]:
                current_headers[-2][header_text] = {}
                current_headers[-1][header_text] = {}
            else:
                while level <= current_levels[-1]:
                    current_headers.pop()
                    current_levels.pop()
                current_headers[-1][header_text] = {}
                current_headers.append(current_headers[-1][header_text])
                current_levels.append(level)

    return headers

def main():
    doc = Document('doc_file.docx')
    structured_content = detect_headers_and_subheaders(doc)
    print(structured_content)

if __name__ == "__main__":
    main()

I tried using adobe-extract api with python and parse the json but still facing the same problem.

0

There are 0 answers