# IfcOpenShell - IFC toolkit and geometry engine # Copyright (C) 2022, 2023 @Andrej730 # # This file is part of IfcOpenShell. # # IfcOpenShell is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # IfcOpenShell is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License # along with IfcOpenShell. If not, see . import copy import json from pathlib import Path from typing import Optional, TypedDict, Union from typing_extensions import NotRequired import ifcopenshell import ifcopenshell.ifcopenshell_wrapper as ifcopenshell_wrapper import ifcopenshell.util.attribute import ifcopenshell.util.schema try: import glob import re import shutil import urllib.parse import warnings import zipfile import requests from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning from lxml import etree from markdown import markdown except: pass # Only necessary if you're using it to generate the docs database BASE_MODULE_PATH = Path(__file__).parent IFC2x3_DOCS_LOCATION = BASE_MODULE_PATH / "Ifc2.3.0.1" IFC4_DOCS_LOCATION = BASE_MODULE_PATH / "Ifc4.0.2.1" IFC4x3_HTML_LOCATION = BASE_MODULE_PATH / "IFC4.3-html" IFC4x3_DEV_LOCATION = BASE_MODULE_PATH / "IFC4.3.x-development" IFC4x3_SPEC_URL_TEMPLATE = "https://ifc43-docs.standards.buildingsmart.org/IFC/RELEASE/IFC4x3/HTML/lexical/%s.htm" class BaseData(TypedDict): description: str spec_url: str class EntityData(BaseData): attributes: NotRequired[dict[str, str]] predefined_types: NotRequired[dict[str, str]] class PsetData(TypedDict): # Apparently some psets in ifc4 are missing spec url / description. 
description: NotRequired[str] spec_url: NotRequired[str] properties: dict[str, str] class PropertyData(TypedDict): description: str # in IFC4x3 there is no children[] for properties children: NotRequired[dict[str, "PropertyData"]] class ClassesSuggestions(TypedDict): name: str predefined_type: NotRequired[str] class SchemaData(TypedDict): entities: dict[str, EntityData] types: dict[str, BaseData] properties: dict[str, PsetData] classes_suggestions: dict[str, ClassesSuggestions] SUPPORTED_SCHEMA = ifcopenshell.util.schema.IFC_SCHEMA SCHEMA_FILES: dict[SUPPORTED_SCHEMA, dict[str, Path]] = { "IFC2X3": { "entities": BASE_MODULE_PATH / "schema/ifc2x3_entities.json", "properties": BASE_MODULE_PATH / "schema/ifc2x3_properties.json", "types": BASE_MODULE_PATH / "schema/ifc2x3_types.json", "classes_suggestions": BASE_MODULE_PATH / "schema/ifc_classes_suggestions.json", }, "IFC4": { "entities": BASE_MODULE_PATH / "schema/ifc4_entities.json", "properties": BASE_MODULE_PATH / "schema/ifc4_properties.json", "types": BASE_MODULE_PATH / "schema/ifc4_types.json", "classes_suggestions": BASE_MODULE_PATH / "schema/ifc_classes_suggestions.json", }, "IFC4X3": { "entities": BASE_MODULE_PATH / "schema/ifc4x3_entities.json", "properties": BASE_MODULE_PATH / "schema/ifc4x3_properties.json", "types": BASE_MODULE_PATH / "schema/ifc4x3_types.json", "classes_suggestions": BASE_MODULE_PATH / "schema/ifc_classes_suggestions.json", }, } db: dict[SUPPORTED_SCHEMA, SchemaData] = None schema_by_name: dict[SUPPORTED_SCHEMA, Optional[ifcopenshell_wrapper.schema_definition]] = { "IFC2X3": None, "IFC4": None, "IFC4X3": None, } def get_db(version: ifcopenshell.util.schema.IFC_SCHEMA) -> Union[SchemaData, None]: global db if not db: db = {ifc_version: dict() for ifc_version in SCHEMA_FILES} for ifc_version in SCHEMA_FILES: for data_type in SCHEMA_FILES[ifc_version]: schema_path = SCHEMA_FILES[ifc_version][data_type] if not schema_path.is_file(): print(f"Schema file {schema_path} wasn't found.") 
files_missing = True continue with open(schema_path, "r") as fi: db[ifc_version][data_type] = json.load(fi) version = ifcopenshell.util.schema.get_fallback_schema(version) return db.get(version) def get_schema_by_name(version: str) -> ifcopenshell_wrapper.schema_definition: global schema_by_name version = ifcopenshell.util.schema.get_fallback_schema(version) if not schema_by_name[version]: schema_by_name[version] = ifcopenshell.schema_by_name(version) return schema_by_name[version] def get_class_suggestions( version: ifcopenshell.util.schema.IFC_SCHEMA, class_name: str, ) -> Union[ClassesSuggestions, None]: db = get_db(version) if not db: return class_suggestions = db["classes_suggestions"].get(class_name) return class_suggestions def get_entity_doc( version: ifcopenshell.util.schema.IFC_SCHEMA, entity_name: str, recursive: bool = True, ) -> Union[EntityData, None]: db = get_db(version) if db: entity = copy.deepcopy(db["entities"].get(entity_name)) if not recursive: return entity ifc_schema = get_schema_by_name(version) ifc_entity = ifc_schema.declaration_by_name(entity_name) ifc_supertype = ifc_entity.supertype() if ifc_supertype: parent_entity = get_entity_doc(version, ifc_supertype.name(), recursive=True) if "attributes" not in entity: entity["attributes"] = dict() for parent_attr in parent_entity.get("attributes", []): entity["attributes"][parent_attr] = parent_entity["attributes"][parent_attr] return entity def get_attribute_doc( version: ifcopenshell.util.schema.IFC_SCHEMA, entity: str, attribute: str, recursive=True, ) -> Union[str, None]: db = get_db(version) if db: entity_ = get_entity_doc(version, entity, recursive) if entity_ and "attributes" in entity_: return entity_["attributes"].get(attribute) def get_predefined_type_doc( version: ifcopenshell.util.schema.IFC_SCHEMA, entity: str, predefined_type: str, ) -> Union[str, None]: db = get_db(version) if db: entity_ = db["entities"].get(entity) if entity_: return entity_.get("predefined_types", 
{}).get(predefined_type) def get_property_set_doc(version: ifcopenshell.util.schema.IFC_SCHEMA, pset: str) -> Union[PsetData, None]: db = get_db(version) if db: return db["properties"].get(pset) def get_property_doc(version: ifcopenshell.util.schema.IFC_SCHEMA, pset: str, prop: str) -> Union[str, None]: db = get_db(version) if db: pset_ = db["properties"].get(pset) if pset_: return pset_["properties"].get(prop) def get_type_doc(version: ifcopenshell.util.schema.IFC_SCHEMA, ifc_type: str) -> Union[BaseData, None]: db = get_db(version) if db: return db["types"].get(ifc_type) # TODO: there are still some discrepancies between this method # and the specs website because of the asymmetry # More: https://github.com/buildingSMART/IFC4.3.x-development/issues/582 def get_inverse_attributes(el): inverse_attrs = [] for a in el.all_inverse_attributes(): attribute_type = a.attribute_reference().type_of_attribute() # unpacking aggregation types while isinstance(attribute_type, ifcopenshell.ifcopenshell_wrapper.aggregation_type): attribute_type = attribute_type.type_of_element() attribute_type = attribute_type.declared_type() # recursively looking for entities inside the selections types_to_process = [attribute_type] entity_attr_types = [] while types_to_process: for attr_type in types_to_process.copy(): if isinstance(attr_type, ifcopenshell.ifcopenshell_wrapper.select_type): types_to_process.extend([t for t in attr_type.select_list()]) else: entity_attr_types.append(attr_type.name()) types_to_process.remove(attr_type) if el.name() in entity_attr_types: inverse_attrs.append(a) return inverse_attrs class DocExtractor: def clean_highlighted_words(self, text: str) -> str: text = re.sub(r"\b_([a-zA-Z0-9]+)_\b", r"\1", text) text = re.sub(r"\*\*([a-zA-Z0-9]+)\*\*", r"\1", text) return text def clean_description(self, description): description = description.replace("\n", " ") description = description.replace("\u00a0", " ") description = description.split("HISTORY:", 1)[0] description 
= description.strip() return description def extract_ifc2x3(self): print("Parsing data for Ifc2.3.0.1") if not IFC2x3_DOCS_LOCATION.is_dir(): raise Exception( f'Docs for IFC 2.3.0.1 expected to be in folder "{IFC2x3_DOCS_LOCATION.resolve()}\\"\n' "For doc extraction please either setup docs as described above \n" "or change IFC2x3_DOCS_LOCATION in the script accordingly.\n" "You can download docs from the repository: \n" "https://github.com/buildingSMART/IFC/tree/Ifc2.3.0.1" ) # need to parse actual domains from the website # since domains from github paths do not match domains from the websites # probably due domains on the website being from 4_0 # example (property set / github domain / website domain): # Pset_AirTerminalBoxPHistory IfcControlExtension IfcHvacDomain self.extract_ifc2x3_property_sets_site_domains() self.extract_ifc2x3_entities() self.extract_ifc2x3_property_sets() self.extract_ifc2x3_types() def extract_ifc2x3_property_sets_site_domains(self): property_sets_domains = dict() r = requests.get("https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/psd/psd_index.htm") html = BeautifulSoup(r.content, features="lxml") for a in html.find_all("a"): domain, pset = a["href"].removeprefix("./").removesuffix(".xml").split("/") property_sets_domains[pset] = domain # export property sets data with open(BASE_MODULE_PATH / "schema/ifc2x3_property_sets_site_domains.json", "w", encoding="utf-8") as fo: print(f"{len(property_sets_domains)} property sets domains were parsed from the website") json.dump(property_sets_domains, fo, sort_keys=True, indent=4) def setup_ifc2x3_reference_lookup(self): # setup references look up tables to convert property hrefs to actual data paths references_paths_lookup = dict() glob_query = f"{IFC2x3_DOCS_LOCATION}/Constants/*/*" parsed_paths = [filepath for filepath in glob.iglob(f"{IFC2x3_DOCS_LOCATION}/Properties/*/*", recursive=False)] parsed_paths += [filepath for filepath in 
glob.iglob(f"{IFC2x3_DOCS_LOCATION}/Constants/*/*", recursive=False)] for parsed_path in parsed_paths: parsed_path = Path(parsed_path) # all references omit "$" character, I've checked it on 2_3 # need to check it if moving to next IFC version property_reference = parsed_path.stem.replace("$", "") references_paths_lookup[property_reference] = parsed_path return references_paths_lookup def extract_ifc2x3_entities(self): ifc2x3_references_paths_lookup = self.setup_ifc2x3_reference_lookup() ifc4_references_paths_lookup = self.setup_ifc4_reference_lookup() entities_dict = dict() # search entities_paths = [ filepath for filepath in glob.iglob(f"{IFC2x3_DOCS_LOCATION}/Sections/**/Entities", recursive=True) ] for parse_folder_path in entities_paths: for entity_path in glob.iglob(f"{parse_folder_path}/**/"): entity_path = Path(entity_path) entity_name = entity_path.stem entities_dict[entity_name] = dict() # utf-8-sig because of \ufeff occcurs - meaning it's utf bom encoded md_path = entity_path / "Documentation.md" xml_path = entity_path / "DocEntity.xml" md_url_part = urllib.parse.quote(str(md_path.relative_to(Path(__file__).parent).as_posix())) github_md_url = f"https://github.com/buildingSMART/IFC/blob/{md_url_part}" with open(md_path, "r", encoding="utf-8-sig") as fi: # convert markdown to html for easier parsing html = markdown(fi.read()) entity_description = BeautifulSoup(html, features="lxml").find("p").text entity_description = entity_description.replace("\n", " ") entity_description = entity_description.replace("\u00a0", " ") with open(xml_path, "r", encoding="utf-8") as fi: bs_tree = BeautifulSoup(fi.read(), features="lxml") entity_attrs = dict() predefined_types = dict() # temporarily disable MarkupResemblesLocatorWarning # because BeautifulSoup wrongly assume we confused # html code for filepath and gives warnings with warnings.catch_warnings(): warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning) for html_attr in 
bs_tree.find_all("docattribute"): attr_name = html_attr["name"] if attr_name == "PredefinedType": # get references to all predefined types defined_type = html_attr["definedtype"] enum_path = xml_path.parents[2] / "Types" / defined_type / "DocEnumeration.xml" with open(enum_path, "r", encoding="utf-8") as fi: enum_bs_tree = BeautifulSoup(fi.read(), features="lxml") hrefs = [i["href"] for i in enum_bs_tree.find_all("docconstant")] # iterate over list of predefined types for href in hrefs: # in IFC2X3 all documentation for constants is empty # and as a temporary solution I'm trying to get constant's description from IFC4 const_path = ifc4_references_paths_lookup.get( href, ifc2x3_references_paths_lookup[href] ) with open(const_path, "r", encoding="utf-8") as fi: const_bs_tree = BeautifulSoup(fi.read(), features="lxml") const_name = const_bs_tree.find("docconstant")["name"] description_tag = const_bs_tree.find("documentation") const_description = "" if not description_tag else description_tag.text predefined_types[const_name] = const_description else: html_description = BeautifulSoup(html_attr.text, features="lxml") attr_description = html_description.get_text() attr_description = attr_description.replace("\n", " ") attr_description = attr_description.replace("\u00a0", " ") attr_description = attr_description.replace("&npsp;", " ") # discard part of the description with changelog # Example: # https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/ifcpresentationdefinitionresource/lexical/ifcannotationfillarea.htm attr_description = attr_description.split("IFC2x Edition 3 CHANGE", 1)[0] attr_description = attr_description.split("IFC2x Edition 2 Addendum 2 CHANGE", 1)[0] attr_description = attr_description.split("IFC2x2 Addendum 1 change", 1)[0] attr_description = attr_description.split("IFC2x PLATFORM CHANGE", 1)[0] attr_description = attr_description.split("IFC2x3 CHANGE", 1)[0] attr_description = attr_description.split("IFC2x Edition3 CHANGE", 1)[0] 
attr_description = attr_description.strip().rstrip(">").strip() entity_attrs[attr_name] = attr_description if entity_attrs: entities_dict[entity_name]["attributes"] = entity_attrs if predefined_types: entities_dict[entity_name]["predefined_types"] = predefined_types entities_dict[entity_name]["description"] = entity_description spec_url = ( "https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/" f"{md_path.parents[2].name.lower()}/lexical/{entity_name.lower()}.htm" ) entities_dict[entity_name]["spec_url"] = spec_url # export entities data with open(BASE_MODULE_PATH / "schema/ifc2x3_entities.json", "w", encoding="utf-8") as fo: print(f"{len(entities_dict)} entities parsed") json.dump(entities_dict, fo, sort_keys=True, indent=4) def extract_ifc2x3_property_sets(self): property_sets_dict = dict() property_sets_references = dict() # extract lists of properties and theirs references for each property set parsed_paths = [ filepath for filepath in glob.iglob(f"{IFC2x3_DOCS_LOCATION}/Sections/**/PropertySets", recursive=True) ] # prepare property sets domains from the website we extracted earlier with open(BASE_MODULE_PATH / "schema/ifc2x3_property_sets_site_domains.json", "r") as fi: property_sets_site_domains = json.load(fi) for parse_folder_path in parsed_paths: for property_set_path in glob.iglob(f"{parse_folder_path}/**/"): property_set_path = Path(property_set_path) property_set_name = property_set_path.stem property_set_dict = dict() property_references = list() xml_path = property_set_path / "DocPropertySet.xml" md_path = property_set_path / "Documentation.md" if md_path.is_file(): with open(md_path, "r", encoding="utf-8-sig") as fi: # convert markdown to html for easier parsing html = markdown(fi.read()) property_set_description = BeautifulSoup(html, features="lxml").find("p").text property_set_description = property_set_description.replace("\n", " ") property_set_description = property_set_description.split("HISTORY:", 1)[0] property_set_description = 
property_set_description.strip() property_set_dict["description"] = property_set_description else: print( f"WARNING. Property set {property_set_name} has no Documentation.md, " f"property set will be left without description." ) with open(xml_path, "r", encoding="utf-8") as fi: bs_tree = BeautifulSoup(fi.read(), features="lxml") for html_attr in bs_tree.find_all("docproperty"): property_references.append(html_attr["href"]) property_sets_references[property_set_name] = property_references property_set_domain = property_sets_site_domains[property_set_name] spec_url = ( "https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML" f"/psd/{property_set_domain}/{property_set_name}.xml" ) property_set_dict["spec_url"] = spec_url property_sets_dict[property_set_name] = property_set_dict # setup references look up tables to convert property hrefs to actual data paths references_paths_lookup = self.setup_ifc2x3_reference_lookup() # setup a function because we'll need to check child properties recusively def get_property_info_by_href(href): property_dict = dict() property_path = references_paths_lookup[href] md_path = property_path / "Documentation.md" xml_path = property_path / "DocProperty.xml" md_url_part = urllib.parse.quote(str(md_path.relative_to(Path(__file__).parent).as_posix())) xml_url_part = urllib.parse.quote(str(xml_path.relative_to(Path(__file__).parent).as_posix())) github_md_url = f"https://github.com/buildingSMART/IFC/blob/{md_url_part}" github_xml_url = f"https://github.com/buildingSMART/IFC/blob/{xml_url_part}" with open(xml_path, "r", encoding="utf-8") as fi: bs_tree = BeautifulSoup(fi.read(), features="lxml") tags = bs_tree.find_all("docproperty") # check for child properties - if they are present parse their data recursively elements_tag = bs_tree.find("elements") if elements_tag is not None: child_tags = elements_tag.find_all("docproperty") child_tags_dict = dict() for child_tag in child_tags: child_tag_href = child_tag["href"] child_tag_name, 
child_tag_dict = get_property_info_by_href(child_tag_href) child_tags_dict[child_tag_name] = child_tag_dict tags.remove(child_tag) property_dict["children"] = child_tags_dict print(f"Child nodes found inside property xml. Url: {github_xml_url}") if len(tags) != 1: print( f"WARNING. Found more properties inside property xml, " f"only the first one was parsed (number of properties: {len(tags)}). Url: {github_xml_url}." ) property_name = tags[0]["name"] if not md_path.is_file(): print( f"WARNING. Property {property_name} is missing documentation.md, " f"property will be left without description. Url: {github_xml_url}" ) else: with open(md_path, "r", encoding="utf-8-sig") as fi: # convert markdown to html for easier parsing html = markdown(fi.read()) description = BeautifulSoup(html, features="lxml").find("p").text description = description.replace("\n", " ") description = description.replace("\u00a0", " ") property_dict["description"] = description return (property_name, property_dict) # lookup each property reference and save it's name and description for property_set_name in property_sets_references: properties_dict = dict() for property_reference in property_sets_references[property_set_name]: property_name, property_dict = get_property_info_by_href(property_reference) properties_dict[property_name] = property_dict property_sets_dict[property_set_name]["properties"] = properties_dict # export property sets data with open(BASE_MODULE_PATH / "schema/ifc2x3_properties.json", "w", encoding="utf-8") as fo: print(f"{len(property_sets_dict)} property sets parsed") json.dump(property_sets_dict, fo, sort_keys=True, indent=4) def extract_ifc2x3_types(self): types_dict = dict() # search types_paths = [filepath for filepath in glob.iglob(f"{IFC2x3_DOCS_LOCATION}/Sections/**/Types", recursive=True)] for parse_folder_path in types_paths: for type_path in glob.iglob(f"{parse_folder_path}/**/"): type_path = Path(type_path) type_name = type_path.stem types_dict[type_name] = dict() 
md_path = type_path / "Documentation.md" # utf-8-sig because of \ufeff occcurs - meaning it's utf bom encoded with open(md_path, "r", encoding="utf-8-sig") as fi: # convert markdown to html for easier parsing html = markdown(fi.read()) type_description = BeautifulSoup(html, features="lxml").find("p").text type_description = type_description.replace("\n", " ") type_description = type_description.replace("\u00a0", " ") type_description = type_description.replace("Definition from ISO/CD 10303-46:1992: ", "") type_description = type_description.replace("Definition from ISO/CD 10303-42:1992 ", "") type_description = type_description.replace("Definition from ISO/CD 10303-42:1992: ", "") type_description = type_description.replace("Definition from ISO/CD 10303-41:1992: ", "") type_description = type_description.strip() if type_description: types_dict[type_name]["description"] = type_description spec_url = ( "https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/" f"{md_path.parents[2].name.lower()}/lexical/{type_name.lower()}.htm" ) types_dict[type_name]["spec_url"] = spec_url # export entities data with open(BASE_MODULE_PATH / "schema/ifc2x3_types.json", "w", encoding="utf-8") as fo: print(f"{len(types_dict)} ifc types parsed") json.dump(types_dict, fo, sort_keys=True, indent=4) def extract_ifc4(self): print("Parsing data for Ifc4.0.2.1") if not IFC4_DOCS_LOCATION.is_dir(): raise Exception( f'Docs for Ifc4.0.2.1 expected to be in folder "{IFC4_DOCS_LOCATION.resolve()}\\"\n' "For doc extraction please either setup docs as described above \n" "or change IFC4_DOCS_LOCATION in the script accordingly.\n" "You can download docs from the repository: \n" "https://github.com/buildingSMART/IFC/tree/Ifc4.0.2.1" ) # actually domains in Ifc 4.0 are consistent between website and docs # BUT there are two property sets that site is missing and therefore they won't have spec_url # because of them I left the site parsing too # missed property sets: # Pset_BuildingElementCommon 
Pset_ElementCommon self.extract_ifc4_property_sets_site_domains() self.extract_ifc4_entities() self.extract_ifc4_property_sets() self.extract_ifc4_types() def extract_ifc4_property_sets_site_domains(self): property_sets_domains = dict() with requests.get( "https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1" "/HTML/annex/annex-b/alphabeticalorder_psets.htm" ) as r: html = BeautifulSoup(r.content, features="lxml") for a in html.find_all("a", {"class": "listing-link"}): href_split = a["href"].split("/") domain = href_split[3] pset = href_split[5].removesuffix(".htm") property_sets_domains[pset] = domain with requests.get( "https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/" "/HTML/annex/annex-b/alphabeticalorder_qsets.htm" ) as r: html = BeautifulSoup(r.content, features="lxml") for a in html.find_all("a", {"class": "listing-link"}): href_split = a["href"].split("/") domain = href_split[3] pset = href_split[5].removesuffix(".htm") property_sets_domains[pset] = domain # export property sets data with open(BASE_MODULE_PATH / "schema/ifc4_property_sets_site_domains.json", "w", encoding="utf-8") as fo: print(f"{len(property_sets_domains)} property sets domains were parsed from the website") json.dump(property_sets_domains, fo, sort_keys=True, indent=4) def setup_ifc4_reference_lookup(self): references_paths_lookup = dict() parsed_paths = [filepath for filepath in glob.iglob(f"{IFC4_DOCS_LOCATION}/Properties/*/*", recursive=False)] parsed_paths += [filepath for filepath in glob.iglob(f"{IFC4_DOCS_LOCATION}/Quantities/*/*", recursive=False)] parsed_paths += [filepath for filepath in glob.iglob(f"{IFC4_DOCS_LOCATION}/Constants/*/*", recursive=False)] for parsed_path in parsed_paths: parsed_path = Path(parsed_path) # all references omit "$" character, I've checked it on 4_0 # need to check it if moving to next IFC version # btw no reason to check if all references were used in properties # because there are also child properties property_reference = 
parsed_path.stem.replace("$", "") references_paths_lookup[property_reference] = parsed_path return references_paths_lookup def extract_ifc4_entities(self): references_paths_lookup = self.setup_ifc4_reference_lookup() entities_dict = dict() # search entities_paths = [ filepath for filepath in glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/Entities", recursive=True) ] for parse_folder_path in entities_paths: for entity_path in glob.iglob(f"{parse_folder_path}/**/"): entity_path = Path(entity_path) entity_name = entity_path.stem entities_dict[entity_name] = dict() md_path = entity_path / "Documentation.md" xml_path = entity_path / "DocEntity.xml" md_url_part = urllib.parse.quote(str(md_path.relative_to(Path(__file__).parent).as_posix())) github_md_url = f"https://github.com/buildingSMART/IFC/blob/{md_url_part}" # utf-8-sig because of \ufeff occcurs - meaning it's utf bom encoded with open(md_path, "r", encoding="utf-8-sig") as fi: # convert markdown to html for easier parsing html = markdown(fi.read()) entity_description = BeautifulSoup(html, features="lxml").find("p").text entity_description = entity_description.replace("\n", " ") entity_description = entity_description.replace("\u00a0", " ") entity_description = entity_description.replace("{ .extDef}", "") entity_description = entity_description.strip() with open(xml_path, "r", encoding="utf-8") as fi: bs_tree = BeautifulSoup(fi.read(), features="lxml") entity_attrs = dict() predefined_types = dict() # temporarily disable MarkupResemblesLocatorWarning # because BeautifulSoup wrongly assume we confused # html code for filepath and gives warnings with warnings.catch_warnings(): warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning) for html_attr in bs_tree.find_all("docattribute"): attr_name = html_attr["name"] if attr_name == "PredefinedType": # get references to all predefined types defined_type = html_attr["definedtype"] enum_path = xml_path.parents[2] / "Types" / defined_type / "DocEnumeration.xml" 
with open(enum_path, "r", encoding="utf-8") as fi: enum_bs_tree = BeautifulSoup(fi.read(), features="lxml") hrefs = [i["href"] for i in enum_bs_tree.find_all("docconstant")] # iterate over list of predefined types for href in hrefs: const_path = references_paths_lookup[href] with open(const_path, "r", encoding="utf-8") as fi: const_bs_tree = BeautifulSoup(fi.read(), features="lxml") const_name = const_bs_tree.find("docconstant")["name"] description_tag = const_bs_tree.find("documentation") const_description = "" if not description_tag else description_tag.text predefined_types[const_name] = const_description else: html_description = BeautifulSoup(html_attr.text, features="lxml") attr_description = html_description.get_text() attr_description = attr_description.replace("\n", " ") attr_description = attr_description.replace("\u00a0", " ") # discard part of the description with changelog, notes and examples etc. # Those notes actually can be useful but we'll need a way to reformat them # Example: # https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML/schema/ifcsharedbldgelements/lexical/ifcrelconnectspathelements.htm attr_description = attr_description.split("{ .change-ifc", 1)[0] attr_description = attr_description.split("{ .note", 1)[0] attr_description = attr_description.split("{ .examples", 1)[0] attr_description = attr_description.split("{ .deprecated", 1)[0] attr_description = attr_description.split("{ .history", 1)[0] attr_description = attr_description.strip() entity_attrs[attr_name] = attr_description if entity_attrs: entities_dict[entity_name]["attributes"] = entity_attrs if predefined_types: entities_dict[entity_name]["predefined_types"] = predefined_types entities_dict[entity_name]["description"] = entity_description spec_url = ( "https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML/schema/" f"{md_path.parents[2].name.lower()}/lexical/{entity_name.lower()}.htm" ) entities_dict[entity_name]["spec_url"] = spec_url # 
entities_dict[entity_name]['github_url'] = github_md_url # export entities data with open(BASE_MODULE_PATH / "schema/ifc4_entities.json", "w", encoding="utf-8") as fo: print(f"{len(entities_dict)} entities parsed") json.dump(entities_dict, fo, sort_keys=True, indent=4) def extract_ifc4_property_sets(self): # function parses both property and quantity sets property_sets_dict = dict() property_sets_references = dict() # extract lists of properties and theirs references for each property set parsed_paths = [ filepath for filepath in glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/PropertySets", recursive=True) ] parsed_paths += [ filepath for filepath in glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/QuantitySets", recursive=True) ] # prepare property sets domains from the website we extracted earlier with open(BASE_MODULE_PATH / "schema/ifc4_property_sets_site_domains.json", "r") as fi: property_sets_site_domains = json.load(fi) psets_test = set() for parse_folder_path in parsed_paths: for property_set_path in glob.iglob(f"{parse_folder_path}/**/"): property_set_path = Path(property_set_path) property_set_name = property_set_path.stem property_set_dict = dict() property_references = list() property_quantity = property_set_path.parents[0].name == "QuantitySets" xml_path = property_set_path / ("DocQuantitySet.xml" if property_quantity else "DocPropertySet.xml") md_path = property_set_path / "Documentation.md" if md_path.is_file(): with open(md_path, "r", encoding="utf-8-sig") as fi: # convert markdown to html for easier parsing html = markdown(fi.read()) property_set_description = BeautifulSoup(html, features="lxml").find("p").text property_set_description = property_set_description.replace("\n", " ") property_set_description = property_set_description.split("HISTORY:", 1)[0] property_set_description = property_set_description.strip() property_set_dict["description"] = property_set_description else: print( f"WARNING. 
Property set {property_set_name} has no Documentation.md, " f"property set will be left without description." ) with open(xml_path, "r", encoding="utf-8") as fi: bs_tree = BeautifulSoup(fi.read(), features="lxml") for html_attr in bs_tree.find_all("docquantity" if property_quantity else "docproperty"): property_references.append(html_attr["href"]) property_sets_references[property_set_name] = property_references if property_set_name.lower() not in property_sets_site_domains: print( f"WARNING. {property_set_name} was not found on the spec website, " "this property set won't have any spec_url in schema." ) else: property_set_domain = property_sets_site_domains.get(property_set_name.lower(), "") spec_url = ( "https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML" f"/schema/{property_set_domain}" f"/{'qset' if property_quantity else 'pset'}" f"/{property_set_name.lower()}.htm" ) property_set_dict["spec_url"] = spec_url property_sets_dict[property_set_name] = property_set_dict # setup references look up tables to convert property hrefs to actual data paths references_paths_lookup = self.setup_ifc4_reference_lookup() # setup a function because we'll need to check child properties recusively def get_property_info_by_href(href): property_dict = dict() property_path = references_paths_lookup[href] property_quantity = property_path.parents[1].name == "Quantities" md_path = property_path / "Documentation.md" xml_path = property_path / ("DocQuantity.xml" if property_quantity else "DocProperty.xml") md_url_part = urllib.parse.quote(str(md_path.relative_to(Path(__file__).parent).as_posix())) github_md_url = f"https://github.com/buildingSMART/IFC/blob/{md_url_part}" xml_url_part = urllib.parse.quote(str(xml_path.relative_to(Path(__file__).parent).as_posix())) github_xml_url = f"https://github.com/buildingSMART/IFC/blob/{xml_url_part}" with open(xml_path, "r", encoding="utf-8") as fi: bs_tree = BeautifulSoup(fi.read(), features="lxml") tags = 
bs_tree.find_all("docquantity" if property_quantity else "docproperty") # check for child properties - if they are present parse their data recursively elements_tag = bs_tree.find("elements") if elements_tag is not None: child_tags = elements_tag.find_all("docquantity" if property_quantity else "docproperty") child_tags_dict = dict() for child_tag in child_tags: child_tag_href = child_tag["href"] child_tag_name, child_tag_dict = get_property_info_by_href(child_tag_href) child_tags_dict[child_tag_name] = child_tag_dict tags.remove(child_tag) property_dict["children"] = child_tags_dict print(f"Child nodes found inside property xml. Url: {github_xml_url}") if len(tags) != 1: print( f"WARNING. Found more properties inside property xml, " f"only the first one was parsed (number of properties: {len(tags)}). Url: {github_xml_url}." ) property_name = tags[0]["name"] if not md_path.is_file(): print( f"WARNING. Property {property_name} is missing documentation.md, property will be left without description. 
" f"Url: {github_xml_url}" ) else: with open(md_path, "r", encoding="utf-8-sig") as fi: # convert markdown to html for easier parsing html = markdown(fi.read()) description = BeautifulSoup(html, features="lxml").find("p").text description = description.replace("\n", " ") description = description.replace("\u00a0", " ") property_dict["description"] = description return (property_name, property_dict) # lookup each property reference and save it's name and description for property_set_name in property_sets_references: properties_dict = dict() for property_reference in property_sets_references[property_set_name]: property_name, property_dict = get_property_info_by_href(property_reference) properties_dict[property_name] = property_dict property_sets_dict[property_set_name]["properties"] = properties_dict # export property sets data with open(BASE_MODULE_PATH / "schema/ifc4_properties.json", "w", encoding="utf-8") as fo: print(f"{len(property_sets_dict)} property sets parsed") json.dump(property_sets_dict, fo, sort_keys=True, indent=4) def extract_ifc4_types(self): types_dict = dict() # search types_paths = [filepath for filepath in glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/Types", recursive=True)] for parse_folder_path in types_paths: for type_path in glob.iglob(f"{parse_folder_path}/**/"): type_path = Path(type_path) type_name = type_path.stem types_dict[type_name] = dict() md_path = type_path / "Documentation.md" # utf-8-sig because of \ufeff occcurs - meaning it's utf bom encoded with open(md_path, "r", encoding="utf-8-sig") as fi: # convert markdown to html for easier parsing html = markdown(fi.read().replace("{ .extDef}", "")) type_description = BeautifulSoup(html, features="lxml").find("p").text type_description = type_description.replace("\n", " ") type_description = type_description.replace("\u00a0", " ") type_description = type_description.replace("{ .extDef}", "") type_description = type_description.replace( "NOTE Definition according to ISO/CD 
def extract_ifc4x3(self):
    """Check the IFC4.3 inputs, then extract entities/types and property sets.

    Raises:
        Exception: if the development repository folder, the formal HTML
            release folder, or ``code/entities_description.json`` inside the
            development repository is missing. In the last case
            ``ifc4x3dev_scrape_data_for_docs.py`` is first copied into the
            repository's ``code`` folder so it can be run from there.
    """
    print("Parsing data for Ifc4.3.0.1")

    # (folder, what it holds, constant to edit, where to download it from)
    required_folders = (
        (
            IFC4x3_DEV_LOCATION,
            "Specs development repository",
            "IFC4x3_DEV_LOCATION",
            "https://github.com/buildingSMART/IFC4.3.x-development",
        ),
        (
            IFC4x3_HTML_LOCATION,
            "Formal release",
            "IFC4x3_HTML_LOCATION",
            "https://github.com/buildingsmart/ifc4.3-html",
        ),
    )
    for folder, contents, constant_name, download_url in required_folders:
        if folder.is_dir():
            continue
        raise Exception(
            f'{contents} for Ifc4.3.0.1 expected to be in folder "{folder.resolve()}\\"\n'
            "For doc extraction please either setup docs as described above \n"
            f"or change {constant_name} in the script accordingly.\n"
            "You can download docs from the repository: \n"
            f"{download_url}"
        )

    scraper_name = "ifc4x3dev_scrape_data_for_docs.py"
    dev_code_path = IFC4x3_DEV_LOCATION / "code"
    description_json_path = dev_code_path / "entities_description.json"

    if description_json_path.is_file():
        self.extract_ifc4x3_entities()
        self.extract_ifc4x3_property_sets()
        return

    # The descriptions have to be generated inside the development repository,
    # so hand the scraper over and explain how to run it.
    shutil.copy(BASE_MODULE_PATH / scraper_name, dev_code_path / scraper_name)
    raise Exception(
        f'The entities description data expected to be located in \n"{description_json_path.resolve()}.\n'
        f"To generate it `ifc4x3dev_scrape_data_for_docs.py` will be copied from current folder to \n{dev_code_path}\n"
        "and you'll need to run in from `/code` folder.\nThis script will use development `server.py` "
        "module to extract entities descriptions.\n\n"
        "Before running it make sure you run `create_resources.sh` from `/code` folder first.\n"
        "You'll need to complete at least 3 commands from `create_resources.sh`:\n"
        " py extract_concepts_from_xmi.py ../schemas/IFC.xml\n"
        " py to_pset.py ../schemas/IFC.xml psd\n"
        " py parse_xmi.py ../schemas/IFC.xml"
    )
def extract_ifc4x3_entities(self):
    """Build ``schema/ifc4x3_entities.json`` and ``schema/ifc4x3_types.json``.

    Walks every declaration of the ``IFC4X3_ADD2`` ifcopenshell schema and
    pairs it with the data loaded from ``code/entities_description.json`` in
    the IFC4.3 development repository. Declarations that are not entities go
    to the types file with ``spec_url`` and ``description`` only; entities
    additionally get ``attributes`` (forward and inverse) and
    ``predefined_types`` when any are found.

    Anything present in the schema but absent from the parsed documentation
    is reported with a warning and exported with an empty description.
    """
    with open(IFC4x3_DEV_LOCATION / "code/entities_description.json", "r") as fi:
        entities_description = json.load(fi)

    entities_dict = dict()
    types_dict = dict()

    schema = ifcopenshell.ifcopenshell_wrapper.schema_by_name("IFC4X3_ADD2")
    for entity in schema.declarations():
        entity_name = entity.name()
        # Empty fallback record: a declaration missing from the parsed docs
        # only loses its descriptions instead of raising KeyError below.
        parsed_entity = entities_description.get(entity_name, {})

        entity_data = dict()
        entity_data["spec_url"] = IFC4x3_SPEC_URL_TEMPLATE % entity_name

        if entity_name not in entities_description:
            print(
                f"WARNING. Entity {entity_name} is not present in data parsed from DEV DOCUMENTATION "
                "even though it's present in ifcopenshell schema. It's description will be left empty."
            )
            description = ""
        else:
            description = self.clean_highlighted_words(parsed_entity["description"])
        entity_data["description"] = description

        # types = type_declaration + enumeration_type + select_type
        if not isinstance(entity, ifcopenshell.ifcopenshell_wrapper.entity):
            types_dict[entity_name] = entity_data
            continue

        # entities processing
        # assign attributes / predef types data
        parsed_attributes_data = parsed_entity.get("attributes", {})
        parsed_predefined_types_data = parsed_entity.get("predefined_types", {})
        attributes_data = dict()
        predefined_types = dict()

        # iterate over forward and inverse entity attributes
        # TODO: more eloquent way to get inverse attributes of the declaration?
        for a in list(entity.attributes()) + get_inverse_attributes(entity):
            attr_name = a.name()

            # predefined types: one entry per enumeration item, not per attribute
            if attr_name == "PredefinedType":
                for v in ifcopenshell.util.attribute.get_enum_items(a):
                    if v not in parsed_predefined_types_data:
                        print(
                            f"WARNING. Predefined type {v} (of entity {entity_name}) is not present in data parsed from DEV DOCUMENTATION "
                            "even though it's present in ifcopenshell schema. It's description will be left empty."
                        )
                        description = ""
                    else:
                        description = self.clean_description(parsed_predefined_types_data[v])
                    predefined_types[v] = description
                continue

            # attributes
            if attr_name not in parsed_attributes_data:
                print(
                    f"WARNING. Attribute {attr_name} (of entity {entity_name}) is not present in data parsed from DEV DOCUMENTATION "
                    "even though it's present in ifcopenshell schema. It's description will be left empty."
                )
                description = ""
            else:
                description = self.clean_description(parsed_attributes_data[attr_name])
            attributes_data[attr_name] = description

        # empty sections are omitted from the exported record
        if attributes_data:
            entity_data["attributes"] = attributes_data
        if predefined_types:
            entity_data["predefined_types"] = predefined_types
        entities_dict[entity_name] = entity_data

    # export entities data
    with open(BASE_MODULE_PATH / "schema/ifc4x3_entities.json", "w", encoding="utf-8") as fo:
        print(f"{len(entities_dict)} entities parsed")
        json.dump(entities_dict, fo, sort_keys=True, indent=4)

    # export types data
    with open(BASE_MODULE_PATH / "schema/ifc4x3_types.json", "w", encoding="utf-8") as fo:
        print(f"{len(types_dict)} ifc types parsed")
        json.dump(types_dict, fo, sort_keys=True, indent=4)
def extract_ifc4x3_property_sets(self):
    """Build ``schema/ifc4x3_properties.json`` from the IFC4.3 ``annex-a-psd.zip``.

    The archive from the formal HTML release is unpacked into
    ``temp/annex-a-psd`` next to this module. Every ``*.xml`` file in it is
    parsed as one set named after the file: names starting with ``Pset_``
    list their properties as ``PropertyDef`` elements, all others as
    ``QtoDef`` elements. The temporary folder is removed afterwards, also
    when parsing or the export fails.
    """
    pset_data_zip = IFC4x3_HTML_LOCATION / "IFC/RELEASE/IFC4x3/HTML/annex-a-psd.zip"
    pset_data_location = BASE_MODULE_PATH / "temp/annex-a-psd"
    with zipfile.ZipFile(pset_data_zip, "r") as fi_zip:
        fi_zip.extractall(pset_data_location)

    try:
        property_sets_dict = dict()
        for pset_path in glob.iglob(f"{pset_data_location}/*.xml"):
            pset_path = Path(pset_path)
            pset_name = pset_path.stem
            # pset / qset
            is_pset = pset_name.split("_")[0] == "Pset"

            pset_data = dict()
            pset_data["spec_url"] = IFC4x3_SPEC_URL_TEMPLATE % pset_name

            with open(pset_path, "r", encoding="utf-8") as fi:
                root_xml = etree.fromstring(fi.read())

            # `.text` is None for an empty element, so fall back to ""
            description = root_xml.find("Definition").text or ""
            pset_data["description"] = self.clean_description(description)

            # parsing pset/qset properties data
            prop_data = dict()
            search_tag = "PropertyDef" if is_pset else "QtoDef"
            for prop in root_xml.find(search_tag + "s").findall(search_tag):
                prop_name = prop.find("Name").text
                # it could be just an empty `<Definition/>`
                prop_description = prop.find("Definition").text or ""
                prop_data[prop_name] = {"description": self.clean_description(prop_description)}

            pset_data["properties"] = prop_data
            property_sets_dict[pset_name] = pset_data

        # export property sets data
        with open(BASE_MODULE_PATH / "schema/ifc4x3_properties.json", "w", encoding="utf-8") as fo:
            print(f"{len(property_sets_dict)} property sets parsed")
            json.dump(property_sets_dict, fo, sort_keys=True, indent=4)
    finally:
        # never leave the unpacked archive behind
        shutil.rmtree(pset_data_location)
def run_doc_api_examples():
    """Print one sample lookup per schema version for each doc getter.

    For every section a heading is printed, followed by the result of the
    matching ``get_*_doc`` call for IFC2X3, IFC4 and IFC4X3, in that order.
    """
    schema_versions = ("IFC2X3", "IFC4", "IFC4X3")

    print("Entities (with parent entities attributes included):")
    for schema_version in schema_versions:
        print(get_entity_doc(schema_version, "IfcWindow"))

    print("Entity attributes (with parent entities attributes included):")
    for schema_version in schema_versions:
        print(get_attribute_doc(schema_version, "IfcWindow", "OwnerHistory"))

    print("Entity predefined types:")
    for schema_version in schema_versions:
        print(get_predefined_type_doc(schema_version, "IfcControllerType", "FLOATING"))

    print("Propety sets:")
    for schema_version in schema_versions:
        print(get_property_set_doc(schema_version, "Pset_ZoneCommon"))

    # The sampled property / type differs between IFC2X3 and the later schemas.
    print("Propety sets attributes:")
    sample_property = {
        "IFC2X3": "Category",
        "IFC4": "NetPlannedArea",
        "IFC4X3": "NetPlannedArea",
    }
    for schema_version, property_name in sample_property.items():
        print(get_property_doc(schema_version, "Pset_ZoneCommon", property_name))

    print("Types:")
    sample_type = {
        "IFC2X3": "IfcIsothermalMoistureCapacityMeasure",
        "IFC4": "IfcDuration",
        "IFC4X3": "IfcDuration",
    }
    for schema_version, type_name in sample_type.items():
        print(get_type_doc(schema_version, type_name))
if __name__ == "__main__":
    # Script entry point: run the extractors one schema version after another.
    extractor = DocExtractor()
    extraction_steps = (
        extractor.extract_ifc2x3,
        extractor.extract_ifc4,
        extractor.extract_ifc4x3,
    )
    for run_extraction in extraction_steps:
        run_extraction()
    # run_doc_api_examples()