Skip to content

Commit

Permalink
fix the str problem
Browse files Browse the repository at this point in the history
  • Loading branch information
PascalSun committed Jun 24, 2024
1 parent faf5a65 commit 3e8fb42
Showing 1 changed file with 12 additions and 6 deletions.
18 changes: 12 additions & 6 deletions Docs2KG/kg/pdf_layout_kg.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,14 +269,17 @@ def link_image_to_context(self):
"uuid": str(uuid4()),
"node_properties": {
"text_block_bbox": text_block["bbox"],
"content": text_block["text"],
"content": str(text_block["text"]),
"position": key,
"text_block_number": int(text_block["block_number"]),
},
"children": [],
}
)
nearby_info_dict[key] = {"content": text_block["text"], "uuids": []}
nearby_info_dict[key] = {
"content": str(text_block["text"]),
"uuids": [],
}
"""
We also need to loop over the nodes within this page:
if a text block is highly similar to a content node, we can link them together
Expand Down Expand Up @@ -352,14 +355,17 @@ def link_table_to_context(self):
"uuid": str(uuid4()),
"node_properties": {
"text_block_bbox": text_block["bbox"],
"content": text_block["text"],
"content": str(text_block["text"]),
"position": key,
"text_block_number": int(text_block["block_number"]),
},
"children": [],
}
)
nearby_info_dict[key] = {"content": text_block["text"], "uuids": []}
nearby_info_dict[key] = {
"content": str(text_block["text"]),
"uuids": [],
}
nearby_info_dict = self.link_image_to_tree_node(page_node, nearby_info_dict)
for item in nearby_info:
key = item["node_properties"]["position"]
Expand Down Expand Up @@ -492,7 +498,7 @@ def _create_tree_node(cls, tag: str, node: dict) -> dict:
"""
node_uuid = str(uuid4())
node_properties = {
"content": node.get("content", ""),
"content": str(node.get("content", "")),
"text": json.dumps(node) if tag == "table" else "",
"records": node.get("children", []) if tag == "table" else [],
}
Expand Down Expand Up @@ -566,12 +572,12 @@ def link_image_to_tree_node(self, page_node: dict, nearby_info_dict: dict) -> di
for child in page_node["children"]:
# get the text
content = child["node_properties"].get("content", "")
content = str(content)
nearby_info_dict = self.link_image_to_tree_node(child, nearby_info_dict)
if content.strip() == "":
continue
for key, value in nearby_info_dict.items():
# convert both values to strings so the comparison is consistent
content = str(content)
value_content = str(value["content"])
if content == value_content:
value["uuids"].append(child["uuid"])
Expand Down

0 comments on commit 3e8fb42

Please sign in to comment.