# IfcOpenShell - IFC toolkit and geometry engine
# Copyright (C) 2022, 2023 @Andrej730
#
# This file is part of IfcOpenShell.
#
# IfcOpenShell is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# IfcOpenShell is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.
import copy
import json
from pathlib import Path
from typing import Optional, TypedDict, Union
from typing_extensions import NotRequired
import ifcopenshell
import ifcopenshell.ifcopenshell_wrapper as ifcopenshell_wrapper
import ifcopenshell.util.attribute
import ifcopenshell.util.schema
try:
import glob
import re
import shutil
import urllib.parse
import warnings
import zipfile
import requests
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from lxml import etree
from markdown import markdown
except:
pass # Only necessary if you're using it to generate the docs database
# Folder containing this module; the doc sources and the schema JSON files
# referenced in this file are resolved relative to it.
BASE_MODULE_PATH = Path(__file__).parent
# Expected locations of the locally available documentation sources,
# one folder per documentation release.
IFC2x3_DOCS_LOCATION = BASE_MODULE_PATH / "Ifc2.3.0.1"
IFC4_DOCS_LOCATION = BASE_MODULE_PATH / "Ifc4.0.2.1"
IFC4x3_HTML_LOCATION = BASE_MODULE_PATH / "IFC4.3-html"
IFC4x3_DEV_LOCATION = BASE_MODULE_PATH / "IFC4.3.x-development"
# "%s" is a placeholder for the page name
# (presumably the entity / type name - confirm against the code using it).
IFC4x3_SPEC_URL_TEMPLATE = "https://ifc43-docs.standards.buildingsmart.org/IFC/RELEASE/IFC4x3/HTML/lexical/%s.htm"
class BaseData(TypedDict):
    """Documentation fields shared by entity and type records."""

    # Human readable description text.
    description: str
    # Link to the matching page of the online specification.
    spec_url: str
class EntityData(BaseData):
    """Documentation record of an entity: the base fields plus optional lookups."""

    # Attribute name -> attribute description.
    attributes: NotRequired[dict[str, str]]
    # Predefined type name -> description.
    predefined_types: NotRequired[dict[str, str]]
class PsetData(TypedDict):
# Apparently some psets in ifc4 are missing spec url / description.
description: NotRequired[str]
spec_url: NotRequired[str]
properties: dict[str, str]
class PropertyData(TypedDict):
description: str
# in IFC4x3 there is no children[] for properties
children: NotRequired[dict[str, "PropertyData"]]
class ClassesSuggestions(TypedDict):
name: str
predefined_type: NotRequired[str]
class SchemaData(TypedDict):
    """Complete documentation database of one schema, one key per JSON data file."""

    # Entity name -> entity documentation.
    entities: dict[str, EntityData]
    # Type name -> type documentation.
    types: dict[str, BaseData]
    # Property set name -> property set documentation.
    properties: dict[str, PsetData]
    # Lookup key -> class suggestion entry.
    classes_suggestions: dict[str, ClassesSuggestions]
# Alias of ifcopenshell.util.schema.IFC_SCHEMA, used below to annotate schema-version keys.
SUPPORTED_SCHEMA = ifcopenshell.util.schema.IFC_SCHEMA
# For every supported schema: the JSON file backing each section of the docs database.
# The inner keys match the keys of SchemaData; the class suggestions file
# is the same one for all three schemas.
SCHEMA_FILES: dict[SUPPORTED_SCHEMA, dict[str, Path]] = {
"IFC2X3": {
"entities": BASE_MODULE_PATH / "schema/ifc2x3_entities.json",
"properties": BASE_MODULE_PATH / "schema/ifc2x3_properties.json",
"types": BASE_MODULE_PATH / "schema/ifc2x3_types.json",
"classes_suggestions": BASE_MODULE_PATH / "schema/ifc_classes_suggestions.json",
},
"IFC4": {
"entities": BASE_MODULE_PATH / "schema/ifc4_entities.json",
"properties": BASE_MODULE_PATH / "schema/ifc4_properties.json",
"types": BASE_MODULE_PATH / "schema/ifc4_types.json",
"classes_suggestions": BASE_MODULE_PATH / "schema/ifc_classes_suggestions.json",
},
"IFC4X3": {
"entities": BASE_MODULE_PATH / "schema/ifc4x3_entities.json",
"properties": BASE_MODULE_PATH / "schema/ifc4x3_properties.json",
"types": BASE_MODULE_PATH / "schema/ifc4x3_types.json",
"classes_suggestions": BASE_MODULE_PATH / "schema/ifc_classes_suggestions.json",
},
}
# Module-level cache of the loaded docs database; None until it is first populated.
db: Optional[dict[SUPPORTED_SCHEMA, SchemaData]] = None
# Module-level cache of schema definitions, one slot per schema, None until filled.
schema_by_name: dict[SUPPORTED_SCHEMA, Optional[ifcopenshell_wrapper.schema_definition]] = {
"IFC2X3": None,
"IFC4": None,
"IFC4X3": None,
}
def get_db(version: ifcopenshell.util.schema.IFC_SCHEMA) -> Union[SchemaData, None]:
    """Return the docs database for ``version``, loading the JSON files on first use.

    All files listed in ``SCHEMA_FILES`` are read once and cached in the
    module-level ``db``. A missing file is reported with a message and its
    section is left out of the database.

    :param version: Schema identifier; it is mapped through
        ``ifcopenshell.util.schema.get_fallback_schema`` before the lookup.
    :return: The database of the resolved schema or ``None`` if there is none.
    """
    global db
    if not db:
        loaded = {ifc_version: dict() for ifc_version in SCHEMA_FILES}
        for ifc_version, data_files in SCHEMA_FILES.items():
            for data_type, schema_path in data_files.items():
                if not schema_path.is_file():
                    print(f"Schema file {schema_path} wasn't found.")
                    continue
                # The extractor writes these files as utf-8, read them back the same way.
                with open(schema_path, "r", encoding="utf-8") as fi:
                    loaded[ifc_version][data_type] = json.load(fi)
        # Publish the cache only after everything loaded, so an exception above
        # can't leave a half-filled database behind for later calls.
        db = loaded
    version = ifcopenshell.util.schema.get_fallback_schema(version)
    return db.get(version)
def get_schema_by_name(version: str) -> ifcopenshell_wrapper.schema_definition:
    """Return the schema definition for ``version``, creating and caching it on first request.

    ``version`` is first mapped through ``get_fallback_schema``.
    """
    global schema_by_name
    resolved_version = ifcopenshell.util.schema.get_fallback_schema(version)
    cached_schema = schema_by_name[resolved_version]
    if cached_schema:
        return cached_schema
    schema_by_name[resolved_version] = ifcopenshell.schema_by_name(resolved_version)
    return schema_by_name[resolved_version]
def get_class_suggestions(
    version: ifcopenshell.util.schema.IFC_SCHEMA,
    class_name: str,
) -> Union[ClassesSuggestions, None]:
    """Look up the class suggestions stored for ``class_name``.

    :return: The stored entry, or ``None`` when either the database
        or the entry is not available.
    """
    schema_db = get_db(version)
    if schema_db:
        return schema_db["classes_suggestions"].get(class_name)
    return None
def get_entity_doc(
    version: ifcopenshell.util.schema.IFC_SCHEMA,
    entity_name: str,
    recursive: bool = True,
) -> Union[EntityData, None]:
    """Return a copy of the documentation of ``entity_name``.

    :param version: Schema identifier.
    :param entity_name: Name of the entity to look up.
    :param recursive: When true, attribute descriptions of all supertypes
        are merged into the ``attributes`` of the result.
    :return: A deep copy of the entity documentation (safe to modify), or
        ``None`` if the database or the entity documentation is missing.
    """
    schema_db = get_db(version)
    if not schema_db:
        return None
    entity = copy.deepcopy(schema_db["entities"].get(entity_name))
    # Nothing to merge into when the entity has no documentation at all.
    if entity is None or not recursive:
        return entity

    ifc_schema = get_schema_by_name(version)
    ifc_entity = ifc_schema.declaration_by_name(entity_name)
    ifc_supertype = ifc_entity.supertype()
    if ifc_supertype:
        parent_entity = get_entity_doc(version, ifc_supertype.name(), recursive=True)
        attributes = entity.setdefault("attributes", dict())
        # The parent may be undocumented; otherwise its descriptions
        # take precedence over the ones with the same name stored on the entity.
        if parent_entity:
            attributes.update(parent_entity.get("attributes", {}))
    return entity
def get_attribute_doc(
    version: ifcopenshell.util.schema.IFC_SCHEMA,
    entity: str,
    attribute: str,
    recursive=True,
) -> Union[str, None]:
    """Return the description of ``attribute`` of ``entity``.

    ``recursive`` is passed on to ``get_entity_doc``. ``None`` is returned
    when the database, the entity or the attribute is not documented.
    """
    if not get_db(version):
        return None
    entity_doc = get_entity_doc(version, entity, recursive)
    if not entity_doc or "attributes" not in entity_doc:
        return None
    return entity_doc["attributes"].get(attribute)
def get_predefined_type_doc(
    version: ifcopenshell.util.schema.IFC_SCHEMA,
    entity: str,
    predefined_type: str,
) -> Union[str, None]:
    """Return the description of ``predefined_type`` stored for ``entity``.

    Only the entity's own record is consulted. ``None`` is returned when
    the database, the entity or the predefined type is not documented.
    """
    schema_db = get_db(version)
    if not schema_db:
        return None
    entity_doc = schema_db["entities"].get(entity)
    if not entity_doc:
        return None
    known_types = entity_doc.get("predefined_types", {})
    return known_types.get(predefined_type)
def get_property_set_doc(version: ifcopenshell.util.schema.IFC_SCHEMA, pset: str) -> Union[PsetData, None]:
    """Return the documentation of property set ``pset`` or ``None`` if it is unavailable."""
    schema_db = get_db(version)
    if not schema_db:
        return None
    return schema_db["properties"].get(pset)
def get_property_doc(version: ifcopenshell.util.schema.IFC_SCHEMA, pset: str, prop: str) -> Union[str, None]:
    """Return the documentation of property ``prop`` of property set ``pset``.

    ``None`` is returned when the database, the property set
    or the property is not documented.
    """
    schema_db = get_db(version)
    if not schema_db:
        return None
    pset_doc = schema_db["properties"].get(pset)
    if not pset_doc:
        return None
    return pset_doc["properties"].get(prop)
def get_type_doc(version: ifcopenshell.util.schema.IFC_SCHEMA, ifc_type: str) -> Union[BaseData, None]:
    """Return the documentation of ``ifc_type`` or ``None`` if it is unavailable."""
    schema_db = get_db(version)
    if not schema_db:
        return None
    return schema_db["types"].get(ifc_type)
# TODO: there are still some discrepancies between this method
# and the specs website because of the asymmetry
# More: https://github.com/buildingSMART/IFC4.3.x-development/issues/582
def get_inverse_attributes(el):
    """Return the inverse attributes of ``el`` whose forward attribute can hold ``el`` itself.

    For every inverse attribute the type of the referenced forward attribute
    is resolved (aggregations unwrapped, selects expanded recursively) and the
    inverse attribute is kept only if ``el.name()`` is among the resulting
    type names. The comparison is by exact name.
    """
    wrapper = ifcopenshell.ifcopenshell_wrapper
    matching_attrs = []
    for inverse_attr in el.all_inverse_attributes():
        target_type = inverse_attr.attribute_reference().type_of_attribute()
        # strip any (possibly nested) aggregation to get to the element type
        while isinstance(target_type, wrapper.aggregation_type):
            target_type = target_type.type_of_element()
        target_type = target_type.declared_type()

        # expand select types until only non-select declarations are left
        pending = [target_type]
        referenced_names = set()
        while pending:
            candidate = pending.pop()
            if isinstance(candidate, wrapper.select_type):
                pending.extend(candidate.select_list())
            else:
                referenced_names.add(candidate.name())

        if el.name() in referenced_names:
            matching_attrs.append(inverse_attr)
    return matching_attrs
class DocExtractor:
def clean_highlighted_words(self, text: str) -> str:
    """Remove ``_word_`` and ``**word**`` markers around single alphanumeric words."""
    # underscore emphasis first, then double asterisks
    emphasis_patterns = (
        r"\b_([a-zA-Z0-9]+)_\b",
        r"\*\*([a-zA-Z0-9]+)\*\*",
    )
    for pattern in emphasis_patterns:
        text = re.sub(pattern, r"\1", text)
    return text
def clean_description(self, description):
    """Flatten ``description`` to one line and drop everything from "HISTORY:" onwards.

    Newlines and non-breaking spaces become regular spaces,
    surrounding whitespace is stripped.
    """
    single_line = description.replace("\n", " ").replace("\u00a0", " ")
    before_history, _, _ = single_line.partition("HISTORY:")
    return before_history.strip()
def extract_ifc2x3(self):
    """Run all IFC2X3 extraction steps.

    :raises Exception: if the ``IFC2x3_DOCS_LOCATION`` folder does not exist.
    """
    print("Parsing data for Ifc2.3.0.1")
    docs_available = IFC2x3_DOCS_LOCATION.is_dir()
    if not docs_available:
        message_parts = [
            f'Docs for IFC 2.3.0.1 expected to be in folder "{IFC2x3_DOCS_LOCATION.resolve()}\\"\n',
            "For doc extraction please either setup docs as described above \n",
            "or change IFC2x3_DOCS_LOCATION in the script accordingly.\n",
            "You can download docs from the repository: \n",
            "https://github.com/buildingSMART/IFC/tree/Ifc2.3.0.1",
        ]
        raise Exception("".join(message_parts))

    # The domains have to be parsed from the website first:
    # domains taken from the github paths do not match the website ones,
    # probably because the website domains come from 4_0.
    # Example (property set / github domain / website domain):
    # Pset_AirTerminalBoxPHistory IfcControlExtension IfcHvacDomain
    extraction_steps = (
        self.extract_ifc2x3_property_sets_site_domains,
        self.extract_ifc2x3_entities,
        self.extract_ifc2x3_property_sets,
        self.extract_ifc2x3_types,
    )
    for run_step in extraction_steps:
        run_step()
def extract_ifc2x3_property_sets_site_domains(self):
    """Download the IFC2X3 property sets index and store the domain of every property set.

    The result (property set name -> domain) is written to
    ``schema/ifc2x3_property_sets_site_domains.json``.

    :raises requests.HTTPError: if the index page can't be retrieved successfully.
    """
    index_url = "https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/psd/psd_index.htm"
    property_sets_domains = dict()

    # Timeout so a stalled connection can't hang the extraction forever;
    # an error page must not be parsed as if it was the index.
    with requests.get(index_url, timeout=60) as r:
        r.raise_for_status()
        html = BeautifulSoup(r.content, features="lxml")

    # Anchors without href carry no link to parse.
    for a in html.find_all("a", href=True):
        # hrefs are expected to have the form "./<domain>/<pset>.xml"
        domain, pset = a["href"].removeprefix("./").removesuffix(".xml").split("/")
        property_sets_domains[pset] = domain

    # export property sets data
    with open(BASE_MODULE_PATH / "schema/ifc2x3_property_sets_site_domains.json", "w", encoding="utf-8") as fo:
        print(f"{len(property_sets_domains)} property sets domains were parsed from the website")
        json.dump(property_sets_domains, fo, sort_keys=True, indent=4)
def setup_ifc2x3_reference_lookup(self):
    """Build the lookup table converting hrefs to data paths for IFC2X3.

    :return: dict mapping a reference (name of the path without suffix and
        without "$") to the path found under ``Properties/*/*`` or ``Constants/*/*``.
    """
    references_paths_lookup = dict()
    # Properties first, constants second - on a name clash the constant wins.
    for folder in ("Properties", "Constants"):
        for filepath in glob.iglob(f"{IFC2x3_DOCS_LOCATION}/{folder}/*/*", recursive=False):
            parsed_path = Path(filepath)
            # all references omit "$" character, I've checked it on 2_3
            # need to check it if moving to next IFC version
            property_reference = parsed_path.stem.replace("$", "")
            references_paths_lookup[property_reference] = parsed_path
    return references_paths_lookup
def extract_ifc2x3_entities(self):
    """Extract IFC2X3 entity documentation into ``schema/ifc2x3_entities.json``.

    For every entity folder under ``Sections/**/Entities`` the first paragraph
    of ``Documentation.md`` becomes the description, while ``DocEntity.xml``
    provides the attribute descriptions and, through the enumeration referenced
    by the ``PredefinedType`` attribute, the predefined types.
    """
    ifc2x3_references_paths_lookup = self.setup_ifc2x3_reference_lookup()
    ifc4_references_paths_lookup = self.setup_ifc4_reference_lookup()
    # Attribute descriptions are cut at the first occurrence of each marker
    # to discard the changelog part. Example:
    # https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/ifcpresentationdefinitionresource/lexical/ifcannotationfillarea.htm
    changelog_markers = (
        "IFC2x Edition 3 CHANGE",
        "IFC2x Edition 2 Addendum 2 CHANGE",
        "IFC2x2 Addendum 1 change",
        "IFC2x PLATFORM CHANGE",
        "IFC2x3 CHANGE",
        "IFC2x Edition3 CHANGE",
    )
    entities_dict = dict()

    # search
    entities_paths = list(glob.iglob(f"{IFC2x3_DOCS_LOCATION}/Sections/**/Entities", recursive=True))
    for parse_folder_path in entities_paths:
        for entity_path in glob.iglob(f"{parse_folder_path}/**/"):
            entity_path = Path(entity_path)
            entity_name = entity_path.stem
            entities_dict[entity_name] = dict()

            md_path = entity_path / "Documentation.md"
            xml_path = entity_path / "DocEntity.xml"

            # utf-8-sig because \ufeff occurs - meaning the file is utf bom encoded
            with open(md_path, "r", encoding="utf-8-sig") as fi:
                # convert markdown to html for easier parsing
                html = markdown(fi.read())
            entity_description = BeautifulSoup(html, features="lxml").find("p").text
            entity_description = entity_description.replace("\n", " ")
            entity_description = entity_description.replace("\u00a0", " ")

            with open(xml_path, "r", encoding="utf-8") as fi:
                bs_tree = BeautifulSoup(fi.read(), features="lxml")

            entity_attrs = dict()
            predefined_types = dict()
            # temporarily disable MarkupResemblesLocatorWarning
            # because BeautifulSoup wrongly assumes we confused
            # html code for filepath and gives warnings
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning)
                for html_attr in bs_tree.find_all("docattribute"):
                    attr_name = html_attr["name"]

                    if attr_name == "PredefinedType":
                        # get references to all predefined types
                        defined_type = html_attr["definedtype"]
                        enum_path = xml_path.parents[2] / "Types" / defined_type / "DocEnumeration.xml"
                        with open(enum_path, "r", encoding="utf-8") as fi:
                            enum_bs_tree = BeautifulSoup(fi.read(), features="lxml")
                        hrefs = [i["href"] for i in enum_bs_tree.find_all("docconstant")]

                        # iterate over list of predefined types
                        for href in hrefs:
                            # in IFC2X3 all documentation for constants is empty
                            # and as a temporary solution the constant's description is taken from IFC4.
                            # The IFC2X3 lookup is consulted only if IFC4 has no such constant,
                            # so a constant known just to IFC4 doesn't end in a KeyError.
                            const_path = ifc4_references_paths_lookup.get(href)
                            if const_path is None:
                                const_path = ifc2x3_references_paths_lookup[href]
                            with open(const_path, "r", encoding="utf-8") as fi:
                                const_bs_tree = BeautifulSoup(fi.read(), features="lxml")
                            const_name = const_bs_tree.find("docconstant")["name"]
                            description_tag = const_bs_tree.find("documentation")
                            const_description = "" if not description_tag else description_tag.text
                            predefined_types[const_name] = const_description
                    else:
                        html_description = BeautifulSoup(html_attr.text, features="lxml")
                        attr_description = html_description.get_text()
                        attr_description = attr_description.replace("\n", " ")
                        attr_description = attr_description.replace("\u00a0", " ")
                        # NOTE(review): "&npsp;" looks like a typo of "&nbsp;" - confirm
                        # before changing it since that may alter the generated data.
                        attr_description = attr_description.replace("&npsp;", " ")
                        for marker in changelog_markers:
                            attr_description = attr_description.split(marker, 1)[0]
                        attr_description = attr_description.strip().rstrip(">").strip()
                        entity_attrs[attr_name] = attr_description

            if entity_attrs:
                entities_dict[entity_name]["attributes"] = entity_attrs
            if predefined_types:
                entities_dict[entity_name]["predefined_types"] = predefined_types
            entities_dict[entity_name]["description"] = entity_description
            # the folder two levels above the entity folder is used as the domain part of the url
            spec_url = (
                "https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/"
                f"{md_path.parents[2].name.lower()}/lexical/{entity_name.lower()}.htm"
            )
            entities_dict[entity_name]["spec_url"] = spec_url

    # export entities data
    with open(BASE_MODULE_PATH / "schema/ifc2x3_entities.json", "w", encoding="utf-8") as fo:
        print(f"{len(entities_dict)} entities parsed")
        json.dump(entities_dict, fo, sort_keys=True, indent=4)
def extract_ifc2x3_property_sets(self):
    """Extract IFC2X3 property sets documentation into ``schema/ifc2x3_properties.json``.

    Requires ``schema/ifc2x3_property_sets_site_domains.json`` to be present,
    it is used to build the spec url of every property set. Property sets
    missing from that file are reported and left without ``spec_url``.
    """
    property_sets_dict = dict()
    property_sets_references = dict()

    # extract lists of properties and their references for each property set
    parsed_paths = list(glob.iglob(f"{IFC2x3_DOCS_LOCATION}/Sections/**/PropertySets", recursive=True))

    # prepare property sets domains from the website we extracted earlier
    site_domains_path = BASE_MODULE_PATH / "schema/ifc2x3_property_sets_site_domains.json"
    with open(site_domains_path, "r", encoding="utf-8") as fi:
        property_sets_site_domains = json.load(fi)

    for parse_folder_path in parsed_paths:
        for property_set_path in glob.iglob(f"{parse_folder_path}/**/"):
            property_set_path = Path(property_set_path)
            property_set_name = property_set_path.stem
            property_set_dict = dict()
            property_references = list()

            xml_path = property_set_path / "DocPropertySet.xml"
            md_path = property_set_path / "Documentation.md"

            if md_path.is_file():
                with open(md_path, "r", encoding="utf-8-sig") as fi:
                    # convert markdown to html for easier parsing
                    html = markdown(fi.read())
                property_set_description = BeautifulSoup(html, features="lxml").find("p").text
                property_set_description = property_set_description.replace("\n", " ")
                property_set_description = property_set_description.split("HISTORY:", 1)[0]
                property_set_description = property_set_description.strip()
                property_set_dict["description"] = property_set_description
            else:
                print(
                    f"WARNING. Property set {property_set_name} has no Documentation.md, "
                    f"property set will be left without description."
                )

            with open(xml_path, "r", encoding="utf-8") as fi:
                bs_tree = BeautifulSoup(fi.read(), features="lxml")
            for html_attr in bs_tree.find_all("docproperty"):
                property_references.append(html_attr["href"])
            property_sets_references[property_set_name] = property_references

            # same handling as for IFC4: a set unknown to the website
            # is kept, just without the spec url
            property_set_domain = property_sets_site_domains.get(property_set_name)
            if property_set_domain is None:
                print(
                    f"WARNING. {property_set_name} was not found on the spec website, "
                    "this property set won't have any spec_url in schema."
                )
            else:
                spec_url = (
                    "https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML"
                    f"/psd/{property_set_domain}/{property_set_name}.xml"
                )
                property_set_dict["spec_url"] = spec_url
            property_sets_dict[property_set_name] = property_set_dict

    # setup references look up tables to convert property hrefs to actual data paths
    references_paths_lookup = self.setup_ifc2x3_reference_lookup()

    # setup a function because we'll need to check child properties recursively
    def get_property_info_by_href(href):
        """Return ``(property name, property data)`` for the property referenced by ``href``."""
        property_dict = dict()
        property_path = references_paths_lookup[href]
        md_path = property_path / "Documentation.md"
        xml_path = property_path / "DocProperty.xml"

        # github url of the xml, only used to make the messages below traceable
        xml_url_part = urllib.parse.quote(str(xml_path.relative_to(Path(__file__).parent).as_posix()))
        github_xml_url = f"https://github.com/buildingSMART/IFC/blob/{xml_url_part}"

        with open(xml_path, "r", encoding="utf-8") as fi:
            bs_tree = BeautifulSoup(fi.read(), features="lxml")
        tags = bs_tree.find_all("docproperty")

        # check for child properties - if they are present parse their data recursively
        elements_tag = bs_tree.find("elements")
        if elements_tag is not None:
            child_tags = elements_tag.find_all("docproperty")
            child_tags_dict = dict()
            for child_tag in child_tags:
                child_tag_href = child_tag["href"]
                child_tag_name, child_tag_dict = get_property_info_by_href(child_tag_href)
                child_tags_dict[child_tag_name] = child_tag_dict
                # children are not candidates for the property described by this xml
                tags.remove(child_tag)
            property_dict["children"] = child_tags_dict
            print(f"Child nodes found inside property xml. Url: {github_xml_url}")

        if len(tags) != 1:
            print(
                f"WARNING. Found more properties inside property xml, "
                f"only the first one was parsed (number of properties: {len(tags)}). Url: {github_xml_url}."
            )
        property_name = tags[0]["name"]

        if not md_path.is_file():
            print(
                f"WARNING. Property {property_name} is missing documentation.md, "
                f"property will be left without description. Url: {github_xml_url}"
            )
        else:
            with open(md_path, "r", encoding="utf-8-sig") as fi:
                # convert markdown to html for easier parsing
                html = markdown(fi.read())
            description = BeautifulSoup(html, features="lxml").find("p").text
            description = description.replace("\n", " ")
            description = description.replace("\u00a0", " ")
            property_dict["description"] = description
        return (property_name, property_dict)

    # lookup each property reference and save its name and description
    for property_set_name, property_references in property_sets_references.items():
        properties_dict = dict()
        for property_reference in property_references:
            property_name, property_dict = get_property_info_by_href(property_reference)
            properties_dict[property_name] = property_dict
        property_sets_dict[property_set_name]["properties"] = properties_dict

    # export property sets data
    with open(BASE_MODULE_PATH / "schema/ifc2x3_properties.json", "w", encoding="utf-8") as fo:
        print(f"{len(property_sets_dict)} property sets parsed")
        json.dump(property_sets_dict, fo, sort_keys=True, indent=4)
def extract_ifc2x3_types(self):
    """Extract IFC2X3 type documentation into ``schema/ifc2x3_types.json``.

    The description is the first paragraph of the type's ``Documentation.md``
    with the "Definition from ISO/CD ..." prefixes removed. It is stored only
    if something is left after the clean up, the spec url is stored always.
    """
    # removed in exactly this order
    iso_prefixes = (
        "Definition from ISO/CD 10303-46:1992: ",
        "Definition from ISO/CD 10303-42:1992 ",
        "Definition from ISO/CD 10303-42:1992: ",
        "Definition from ISO/CD 10303-41:1992: ",
    )
    types_dict = dict()

    types_folders = list(glob.iglob(f"{IFC2x3_DOCS_LOCATION}/Sections/**/Types", recursive=True))
    for types_folder in types_folders:
        for found_path in glob.iglob(f"{types_folder}/**/"):
            type_path = Path(found_path)
            type_name = type_path.stem
            type_info = dict()
            types_dict[type_name] = type_info

            md_path = type_path / "Documentation.md"
            # the files are utf bom encoded (\ufeff occurs), hence utf-8-sig
            with open(md_path, "r", encoding="utf-8-sig") as fi:
                # markdown is converted to html since it's easier to parse
                html = markdown(fi.read())

            first_paragraph = BeautifulSoup(html, features="lxml").find("p")
            type_description = first_paragraph.text.replace("\n", " ").replace("\u00a0", " ")
            for prefix in iso_prefixes:
                type_description = type_description.replace(prefix, "")
            type_description = type_description.strip()
            if type_description:
                type_info["description"] = type_description

            domain = md_path.parents[2].name.lower()
            type_info["spec_url"] = (
                "https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/"
                f"{domain}/lexical/{type_name.lower()}.htm"
            )

    # export types data
    with open(BASE_MODULE_PATH / "schema/ifc2x3_types.json", "w", encoding="utf-8") as fo:
        print(f"{len(types_dict)} ifc types parsed")
        json.dump(types_dict, fo, sort_keys=True, indent=4)
def extract_ifc4(self):
    """Run all IFC4 extraction steps.

    :raises Exception: if the ``IFC4_DOCS_LOCATION`` folder does not exist.
    """
    print("Parsing data for Ifc4.0.2.1")
    docs_available = IFC4_DOCS_LOCATION.is_dir()
    if not docs_available:
        message = (
            f'Docs for Ifc4.0.2.1 expected to be in folder "{IFC4_DOCS_LOCATION.resolve()}\\"\n'
            "For doc extraction please either setup docs as described above \n"
            "or change IFC4_DOCS_LOCATION in the script accordingly.\n"
            "You can download docs from the repository: \n"
            "https://github.com/buildingSMART/IFC/tree/Ifc4.0.2.1"
        )
        raise Exception(message)

    # Domains in Ifc 4.0 actually are consistent between the website and the docs,
    # BUT the site is missing two property sets which therefore won't get a spec_url.
    # The site parsing is kept because of them. Missed property sets:
    # Pset_BuildingElementCommon Pset_ElementCommon
    extraction_steps = (
        self.extract_ifc4_property_sets_site_domains,
        self.extract_ifc4_entities,
        self.extract_ifc4_property_sets,
        self.extract_ifc4_types,
    )
    for run_step in extraction_steps:
        run_step()
def extract_ifc4_property_sets_site_domains(self):
    """Download the IFC4 property / quantity sets listings and store the domain of every set.

    The result (set name as found in the link -> domain) is written to
    ``schema/ifc4_property_sets_site_domains.json``.

    :raises requests.HTTPError: if a listing page can't be retrieved successfully.
    """
    annex_url = "https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML/annex/annex-b/"
    # property sets first, quantity sets second
    listing_pages = ("alphabeticalorder_psets.htm", "alphabeticalorder_qsets.htm")
    property_sets_domains = dict()

    for listing_page in listing_pages:
        # Timeout so a stalled connection can't hang the extraction forever;
        # an error page must not be parsed as if it was the listing.
        with requests.get(annex_url + listing_page, timeout=60) as r:
            r.raise_for_status()
            html = BeautifulSoup(r.content, features="lxml")
        for a in html.find_all("a", {"class": "listing-link"}):
            # the 4th path segment of the link is taken as the domain and the 6th as the page of the set
            # (presumably the links look like "../../schema/<domain>/pset/<name>.htm" - confirm on the page)
            href_split = a["href"].split("/")
            domain = href_split[3]
            pset = href_split[5].removesuffix(".htm")
            property_sets_domains[pset] = domain

    # export property sets data
    with open(BASE_MODULE_PATH / "schema/ifc4_property_sets_site_domains.json", "w", encoding="utf-8") as fo:
        print(f"{len(property_sets_domains)} property sets domains were parsed from the website")
        json.dump(property_sets_domains, fo, sort_keys=True, indent=4)
def setup_ifc4_reference_lookup(self):
    """Build the lookup table converting hrefs to data paths for IFC4.

    :return: dict mapping a reference (name of the path without suffix and
        without "$") to the path found under ``Properties``, ``Quantities``
        or ``Constants`` (two levels deep).
    """
    # the order matters: on a name clash the later folder wins
    doc_folders = ("Properties", "Quantities", "Constants")
    found_paths = [
        Path(match)
        for folder in doc_folders
        for match in glob.iglob(f"{IFC4_DOCS_LOCATION}/{folder}/*/*", recursive=False)
    ]
    # All references omit the "$" character - checked on 4_0,
    # needs to be checked again when moving to the next IFC version.
    # There's no point in checking whether all references were used
    # in properties because there are also child properties.
    return {found.stem.replace("$", ""): found for found in found_paths}
def extract_ifc4_entities(self):
    """Extract IFC4 entity documentation into ``schema/ifc4_entities.json``.

    For every entity folder under ``Sections/**/Entities`` the first paragraph
    of ``Documentation.md`` becomes the description, while ``DocEntity.xml``
    provides the attribute descriptions and, through the enumeration referenced
    by the ``PredefinedType`` attribute, the predefined types.
    """
    references_paths_lookup = self.setup_ifc4_reference_lookup()
    # Attribute descriptions are cut at the first occurrence of each marker
    # to discard changelog, notes, examples etc.
    # Those notes actually can be useful but we'll need a way to reformat them. Example:
    # https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML/schema/ifcsharedbldgelements/lexical/ifcrelconnectspathelements.htm
    discarded_sections_markers = (
        "{ .change-ifc",
        "{ .note",
        "{ .examples",
        "{ .deprecated",
        "{ .history",
    )
    entities_dict = dict()

    # search
    entities_paths = list(glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/Entities", recursive=True))
    for parse_folder_path in entities_paths:
        for entity_path in glob.iglob(f"{parse_folder_path}/**/"):
            entity_path = Path(entity_path)
            entity_name = entity_path.stem
            entities_dict[entity_name] = dict()

            md_path = entity_path / "Documentation.md"
            xml_path = entity_path / "DocEntity.xml"

            # utf-8-sig because \ufeff occurs - meaning the file is utf bom encoded
            with open(md_path, "r", encoding="utf-8-sig") as fi:
                # convert markdown to html for easier parsing
                html = markdown(fi.read())
            entity_description = BeautifulSoup(html, features="lxml").find("p").text
            entity_description = entity_description.replace("\n", " ")
            entity_description = entity_description.replace("\u00a0", " ")
            entity_description = entity_description.replace("{ .extDef}", "")
            entity_description = entity_description.strip()

            with open(xml_path, "r", encoding="utf-8") as fi:
                bs_tree = BeautifulSoup(fi.read(), features="lxml")

            entity_attrs = dict()
            predefined_types = dict()
            # temporarily disable MarkupResemblesLocatorWarning
            # because BeautifulSoup wrongly assumes we confused
            # html code for filepath and gives warnings
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning)
                for html_attr in bs_tree.find_all("docattribute"):
                    attr_name = html_attr["name"]

                    if attr_name == "PredefinedType":
                        # get references to all predefined types
                        defined_type = html_attr["definedtype"]
                        enum_path = xml_path.parents[2] / "Types" / defined_type / "DocEnumeration.xml"
                        with open(enum_path, "r", encoding="utf-8") as fi:
                            enum_bs_tree = BeautifulSoup(fi.read(), features="lxml")
                        hrefs = [i["href"] for i in enum_bs_tree.find_all("docconstant")]

                        # iterate over list of predefined types
                        for href in hrefs:
                            const_path = references_paths_lookup[href]
                            with open(const_path, "r", encoding="utf-8") as fi:
                                const_bs_tree = BeautifulSoup(fi.read(), features="lxml")
                            const_name = const_bs_tree.find("docconstant")["name"]
                            description_tag = const_bs_tree.find("documentation")
                            const_description = "" if not description_tag else description_tag.text
                            predefined_types[const_name] = const_description
                    else:
                        html_description = BeautifulSoup(html_attr.text, features="lxml")
                        attr_description = html_description.get_text()
                        attr_description = attr_description.replace("\n", " ")
                        attr_description = attr_description.replace("\u00a0", " ")
                        for marker in discarded_sections_markers:
                            attr_description = attr_description.split(marker, 1)[0]
                        attr_description = attr_description.strip()
                        entity_attrs[attr_name] = attr_description

            if entity_attrs:
                entities_dict[entity_name]["attributes"] = entity_attrs
            if predefined_types:
                entities_dict[entity_name]["predefined_types"] = predefined_types
            entities_dict[entity_name]["description"] = entity_description
            # the folder two levels above the entity folder is used as the domain part of the url
            spec_url = (
                "https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML/schema/"
                f"{md_path.parents[2].name.lower()}/lexical/{entity_name.lower()}.htm"
            )
            entities_dict[entity_name]["spec_url"] = spec_url

    # export entities data
    with open(BASE_MODULE_PATH / "schema/ifc4_entities.json", "w", encoding="utf-8") as fo:
        print(f"{len(entities_dict)} entities parsed")
        json.dump(entities_dict, fo, sort_keys=True, indent=4)
def extract_ifc4_property_sets(self):
# function parses both property and quantity sets
property_sets_dict = dict()
property_sets_references = dict()
# extract lists of properties and theirs references for each property set
parsed_paths = [
filepath for filepath in glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/PropertySets", recursive=True)
]
parsed_paths += [
filepath for filepath in glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/QuantitySets", recursive=True)
]
# prepare property sets domains from the website we extracted earlier
with open(BASE_MODULE_PATH / "schema/ifc4_property_sets_site_domains.json", "r") as fi:
property_sets_site_domains = json.load(fi)
psets_test = set()
for parse_folder_path in parsed_paths:
for property_set_path in glob.iglob(f"{parse_folder_path}/**/"):
property_set_path = Path(property_set_path)
property_set_name = property_set_path.stem
property_set_dict = dict()
property_references = list()
property_quantity = property_set_path.parents[0].name == "QuantitySets"
xml_path = property_set_path / ("DocQuantitySet.xml" if property_quantity else "DocPropertySet.xml")
md_path = property_set_path / "Documentation.md"
if md_path.is_file():
with open(md_path, "r", encoding="utf-8-sig") as fi:
# convert markdown to html for easier parsing
html = markdown(fi.read())
property_set_description = BeautifulSoup(html, features="lxml").find("p").text
property_set_description = property_set_description.replace("\n", " ")
property_set_description = property_set_description.split("HISTORY:", 1)[0]
property_set_description = property_set_description.strip()
property_set_dict["description"] = property_set_description
else:
print(
f"WARNING. Property set {property_set_name} has no Documentation.md, "
f"property set will be left without description."
)
with open(xml_path, "r", encoding="utf-8") as fi:
bs_tree = BeautifulSoup(fi.read(), features="lxml")
for html_attr in bs_tree.find_all("docquantity" if property_quantity else "docproperty"):
property_references.append(html_attr["href"])
property_sets_references[property_set_name] = property_references
if property_set_name.lower() not in property_sets_site_domains:
print(
f"WARNING. {property_set_name} was not found on the spec website, "
"this property set won't have any spec_url in schema."
)
else:
property_set_domain = property_sets_site_domains.get(property_set_name.lower(), "")
spec_url = (
"https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML"
f"/schema/{property_set_domain}"
f"/{'qset' if property_quantity else 'pset'}"
f"/{property_set_name.lower()}.htm"
)
property_set_dict["spec_url"] = spec_url
property_sets_dict[property_set_name] = property_set_dict
# setup references look up tables to convert property hrefs to actual data paths
references_paths_lookup = self.setup_ifc4_reference_lookup()
# setup a function because we'll need to check child properties recusively
def get_property_info_by_href(href):
    """Collect the name and documentation of the property referenced by ``href``.

    ``href`` is resolved to a property folder through ``references_paths_lookup``
    from the enclosing scope. The folder's ``DocProperty.xml`` (``DocQuantity.xml``
    for quantities) provides the property name and the references to child
    properties, which are parsed recursively. ``Documentation.md``, if present,
    provides the description.

    :param href: property reference, a key of ``references_paths_lookup``
    :return: tuple of the property name and a dictionary that may contain
        "description" and "children" keys
    :raises IndexError: if the xml has no property tag to take the name from
    """
    property_dict = dict()
    property_path = references_paths_lookup[href]
    property_quantity = property_path.parents[1].name == "Quantities"
    tag_name = "docquantity" if property_quantity else "docproperty"
    md_path = property_path / "Documentation.md"
    xml_path = property_path / ("DocQuantity.xml" if property_quantity else "DocProperty.xml")
    # the url is only used to make the diagnostics below easier to follow up
    xml_url_part = urllib.parse.quote(str(xml_path.relative_to(Path(__file__).parent).as_posix()))
    github_xml_url = f"https://github.com/buildingSMART/IFC/blob/{xml_url_part}"

    with open(xml_path, "r", encoding="utf-8") as fi:
        bs_tree = BeautifulSoup(fi.read(), features="lxml")
    tags = bs_tree.find_all(tag_name)

    # check for child properties - if they are present parse their data recursively
    elements_tag = bs_tree.find("elements")
    if elements_tag is not None:
        child_tags = elements_tag.find_all(tag_name)
        child_tags_dict = dict()
        for child_tag in child_tags:
            child_tag_href = child_tag["href"]
            child_tag_name, child_tag_dict = get_property_info_by_href(child_tag_href)
            child_tags_dict[child_tag_name] = child_tag_dict
            # child tags are also matched by the document wide search above,
            # drop them so only the property itself is left in `tags`
            tags.remove(child_tag)
        property_dict["children"] = child_tags_dict
        print(f"Child nodes found inside property xml. Url: {github_xml_url}")

    if not tags:
        # same exception type as indexing the empty result, but with a pointer to the file
        raise IndexError(f"No <{tag_name}> tag found inside property xml. Url: {github_xml_url}")
    if len(tags) != 1:
        print(
            f"WARNING. Found more properties inside property xml, "
            f"only the first one was parsed (number of properties: {len(tags)}). Url: {github_xml_url}."
        )
    property_name = tags[0]["name"]

    if not md_path.is_file():
        print(
            f"WARNING. Property {property_name} is missing documentation.md, property will be left without description. "
            f"Url: {github_xml_url}"
        )
    else:
        with open(md_path, "r", encoding="utf-8-sig") as fi:
            # convert markdown to html for easier parsing
            html = markdown(fi.read())
        paragraph = BeautifulSoup(html, features="lxml").find("p")
        if paragraph is None:
            # `find` returns None when the markdown has no paragraph at all
            print(
                f"WARNING. Documentation.md of property {property_name} has no text paragraph, "
                f"property will be left without description. Url: {github_xml_url}"
            )
        else:
            description = paragraph.text
            description = description.replace("\n", " ")
            description = description.replace("\u00a0", " ")
            property_dict["description"] = description
    return (property_name, property_dict)
# lookup each property reference and save it's name and description
for property_set_name in property_sets_references:
properties_dict = dict()
for property_reference in property_sets_references[property_set_name]:
property_name, property_dict = get_property_info_by_href(property_reference)
properties_dict[property_name] = property_dict
property_sets_dict[property_set_name]["properties"] = properties_dict
# export property sets data
with open(BASE_MODULE_PATH / "schema/ifc4_properties.json", "w", encoding="utf-8") as fo:
print(f"{len(property_sets_dict)} property sets parsed")
json.dump(property_sets_dict, fo, sort_keys=True, indent=4)
def extract_ifc4_types(self):
    """Extract descriptions and spec urls of IFC4 types to ``schema/ifc4_types.json``.

    Every subfolder of each ``Types`` folder found under
    ``IFC4_DOCS_LOCATION / "Sections"`` is treated as one type. The description
    is the first paragraph of the type's ``Documentation.md``. A type without
    that file, or without any paragraph in it, is reported and exported
    with a spec url only.
    """
    types_dict = dict()
    # search
    types_paths = glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/Types", recursive=True)
    for parse_folder_path in types_paths:
        for type_path in glob.iglob(f"{parse_folder_path}/**/"):
            type_path = Path(type_path)
            type_name = type_path.stem
            type_data = types_dict[type_name] = dict()
            md_path = type_path / "Documentation.md"

            type_description = ""
            if not md_path.is_file():
                print(
                    f"WARNING. Type {type_name} has no Documentation.md, "
                    f"type will be left without description."
                )
            else:
                # utf-8-sig because \ufeff occurs - meaning the file is utf bom encoded
                with open(md_path, "r", encoding="utf-8-sig") as fi:
                    # convert markdown to html for easier parsing
                    html = markdown(fi.read().replace("{ .extDef}", ""))
                paragraph = BeautifulSoup(html, features="lxml").find("p")
                if paragraph is None:
                    # `find` returns None when the markdown has no paragraph at all
                    print(
                        f"WARNING. Documentation.md of type {type_name} has no text paragraph, "
                        f"type will be left without description."
                    )
                else:
                    type_description = paragraph.text
                    type_description = type_description.replace("\n", " ")
                    type_description = type_description.replace("\u00a0", " ")
                    # repeated on purpose: the marker can reappear once
                    # non-breaking spaces were turned into regular ones
                    type_description = type_description.replace("{ .extDef}", "")
                    type_description = type_description.replace(
                        "NOTE Definition according to ISO/CD 10303-41:1992 ", ""
                    )
                    type_description = type_description.replace("Definition from ISO/CD 10303-41:1992: ", "")
                    type_description = type_description.strip()

            if type_description:
                type_data["description"] = type_description

            # the schema name is the folder two levels above the type folder
            spec_url = (
                "https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML/schema/"
                f"{md_path.parents[2].name.lower()}/lexical/{type_name.lower()}.htm"
            )
            type_data["spec_url"] = spec_url

    # export types data
    with open(BASE_MODULE_PATH / "schema/ifc4_types.json", "w", encoding="utf-8") as fo:
        print(f"{len(types_dict)} ifc types parsed")
        json.dump(types_dict, fo, sort_keys=True, indent=4)
def extract_ifc4x3(self):
    """Extract entities, types and property sets documentation for Ifc4.3.

    Requires the development repository at ``IFC4x3_DEV_LOCATION`` (including
    ``code/entities_description.json``) and the formal html release at
    ``IFC4x3_HTML_LOCATION``.

    :raises Exception: if one of the folders or the descriptions json is missing.
        In the latter case the scraping script is copied to the ``code``
        folder of the development repository before raising.
    """
    print("Parsing data for Ifc4.3.0.1")

    if not IFC4x3_DEV_LOCATION.is_dir():
        raise Exception(
            f'Specs development repository for Ifc4.3.0.1 expected to be in folder "{IFC4x3_DEV_LOCATION.resolve()}\\"\n'
            "For doc extraction please either setup docs as described above \n"
            "or change IFC4x3_DEV_LOCATION in the script accordingly.\n"
            "You can download docs from the repository: \n"
            "https://github.com/buildingSMART/IFC4.3.x-development"
        )

    if not IFC4x3_HTML_LOCATION.is_dir():
        raise Exception(
            f'Formal release for Ifc4.3.0.1 expected to be in folder "{IFC4x3_HTML_LOCATION.resolve()}\\"\n'
            "For doc extraction please either setup docs as described above \n"
            "or change IFC4x3_HTML_LOCATION in the script accordingly.\n"
            "You can download docs from the repository: \n"
            "https://github.com/buildingsmart/ifc4.3-html"
        )

    dev_code_path = IFC4x3_DEV_LOCATION / "code"
    description_json_path = dev_code_path / "entities_description.json"
    if not description_json_path.is_file():
        # the json has to be produced inside the development repository,
        # hand the user the script that generates it
        shutil.copy(
            BASE_MODULE_PATH / "ifc4x3dev_scrape_data_for_docs.py",
            dev_code_path / "ifc4x3dev_scrape_data_for_docs.py",
        )
        raise Exception(
            f'The entities description data expected to be located in \n"{description_json_path.resolve()}".\n'
            f"To generate it `ifc4x3dev_scrape_data_for_docs.py` will be copied from current folder to \n{dev_code_path}\n"
            "and you'll need to run it from `/code` folder.\nThis script will use development `server.py` "
            "module to extract entities descriptions.\n\n"
            "Before running it make sure you run `create_resources.sh` from `/code` folder first.\n"
            "You'll need to complete at least 3 commands from `create_resources.sh`:\n"
            " py extract_concepts_from_xmi.py ../schemas/IFC.xml\n"
            " py to_pset.py ../schemas/IFC.xml psd\n"
            " py parse_xmi.py ../schemas/IFC.xml"
        )

    self.extract_ifc4x3_entities()
    self.extract_ifc4x3_property_sets()
def extract_ifc4x3_entities(self):
    """Export IFC4X3 entities and types documentation to json.

    Walks all declarations of the ``IFC4X3_ADD2`` schema and matches them with
    the data of ``code/entities_description.json`` from the development
    repository. Entities (with their attributes and predefined types) are
    written to ``schema/ifc4x3_entities.json``, all other declarations to
    ``schema/ifc4x3_types.json``. Anything present in the schema but absent
    from the parsed data is reported and gets an empty description.
    """
    # json is exchanged as utf-8, do not depend on the platform default encoding
    with open(IFC4x3_DEV_LOCATION / "code/entities_description.json", "r", encoding="utf-8") as fi:
        entities_description = json.load(fi)

    entities_dict = dict()
    types_dict = dict()
    schema = ifcopenshell.ifcopenshell_wrapper.schema_by_name("IFC4X3_ADD2")

    for entity in schema.declarations():
        entity_name = entity.name()
        entity_data = dict()
        entity_data["spec_url"] = IFC4x3_SPEC_URL_TEMPLATE % entity_name

        parsed_entity_data = entities_description.get(entity_name)
        if parsed_entity_data is None:
            print(
                f"WARNING. Entity {entity_name} is not present in data parsed from DEV DOCUMENTATION "
                "even though it's present in ifcopenshell schema. It's description will be left empty."
            )
            # keep going with empty data so attributes below are reported
            # individually instead of failing on the missing entity
            parsed_entity_data = dict()
            description = ""
        else:
            description = self.clean_highlighted_words(parsed_entity_data["description"])
        entity_data["description"] = description

        # types = type_declaration + enumeration_type + select_type
        if not isinstance(entity, ifcopenshell.ifcopenshell_wrapper.entity):
            types_dict[entity_name] = entity_data
            continue

        # entities processing
        # assign attributes / predef types data
        parsed_attributes_data = parsed_entity_data.get("attributes", {})
        parsed_predefined_types_data = parsed_entity_data.get("predefined_types", {})
        attributes_data = dict()
        predefined_types = dict()

        # iterate over forward and inverse entity attributes
        # TODO: more eloquent way to get inverse attributes of the declaration?
        for a in list(entity.attributes()) + get_inverse_attributes(entity):
            attr_name = a.name()

            # predefined types
            if attr_name == "PredefinedType":
                for v in ifcopenshell.util.attribute.get_enum_items(a):
                    if v not in parsed_predefined_types_data:
                        print(
                            f"WARNING. Predefined type {v} (of entity {entity_name}) is not present in data parsed from DEV DOCUMENTATION "
                            "even though it's present in ifcopenshell schema. It's description will be left empty."
                        )
                        description = ""
                    else:
                        description = self.clean_description(parsed_predefined_types_data[v])
                    predefined_types[v] = description
                continue

            # attributes
            if attr_name not in parsed_attributes_data:
                print(
                    f"WARNING. Attribute {attr_name} (of entity {entity_name}) is not present in data parsed from DEV DOCUMENTATION "
                    "even though it's present in ifcopenshell schema. It's description will be left empty."
                )
                description = ""
            else:
                description = self.clean_description(parsed_attributes_data[attr_name])
            attributes_data[attr_name] = description

        # empty sections are omitted from the exported data
        if attributes_data:
            entity_data["attributes"] = attributes_data
        if predefined_types:
            entity_data["predefined_types"] = predefined_types
        entities_dict[entity_name] = entity_data

    # export entities data
    with open(BASE_MODULE_PATH / "schema/ifc4x3_entities.json", "w", encoding="utf-8") as fo:
        print(f"{len(entities_dict)} entities parsed")
        json.dump(entities_dict, fo, sort_keys=True, indent=4)

    # export types data
    with open(BASE_MODULE_PATH / "schema/ifc4x3_types.json", "w", encoding="utf-8") as fo:
        print(f"{len(types_dict)} ifc types parsed")
        json.dump(types_dict, fo, sort_keys=True, indent=4)
def extract_ifc4x3_property_sets(self):
    """Export IFC4X3 property / quantity sets documentation to json.

    Unpacks ``annex-a-psd.zip`` of the formal html release to a temporary
    folder inside the module folder, parses every xml in it and writes the
    result to ``schema/ifc4x3_properties.json``. The temporary folder is
    removed afterwards, also when parsing fails.
    """
    pset_data_zip = IFC4x3_HTML_LOCATION / "IFC/RELEASE/IFC4x3/HTML/annex-a-psd.zip"
    pset_data_location = BASE_MODULE_PATH / "temp/annex-a-psd"
    property_sets_dict = dict()

    try:
        with zipfile.ZipFile(pset_data_zip, "r") as fi_zip:
            fi_zip.extractall(pset_data_location)

        for pset_path in glob.iglob(f"{pset_data_location}/*.xml"):
            pset_path = Path(pset_path)
            pset_name = pset_path.stem
            # pset / qset
            is_pset = pset_name.split("_")[0] == "Pset"

            pset_data = dict()
            pset_data["spec_url"] = IFC4x3_SPEC_URL_TEMPLATE % pset_name

            with open(pset_path, "r", encoding="utf-8") as fi:
                root_xml = etree.fromstring(fi.read())

            # `find` returns None for a missing tag and `text` is None for an
            # empty one, both end up as an empty description
            definition_tag = root_xml.find("Definition")
            description = definition_tag.text if definition_tag is not None else None
            pset_data["description"] = self.clean_description(description or "")

            # parsing pset/qset properties data
            prop_data = dict()
            search_tag = "PropertyDef" if is_pset else "QtoDef"
            props_container = root_xml.find(search_tag + "s")
            props = props_container.findall(search_tag) if props_container is not None else []
            for prop in props:
                prop_name = prop.find("Name").text
                prop_description = prop.find("Definition").text
                if not prop_description:  # it could be just ``
                    prop_description = ""
                prop_description = self.clean_description(prop_description)
                prop_data[prop_name] = {"description": prop_description}
            pset_data["properties"] = prop_data
            property_sets_dict[pset_name] = pset_data

        # export property sets data
        with open(BASE_MODULE_PATH / "schema/ifc4x3_properties.json", "w", encoding="utf-8") as fo:
            print(f"{len(property_sets_dict)} property sets parsed")
            json.dump(property_sets_dict, fo, sort_keys=True, indent=4)
    finally:
        # the folder does not exist yet if opening the archive failed
        if pset_data_location.is_dir():
            shutil.rmtree(pset_data_location)
def run_doc_api_examples():
    """Print example lookups made with the doc getters for IFC2X3, IFC4 and IFC4X3."""
    schema_names = ("IFC2X3", "IFC4", "IFC4X3")

    print("Entities (with parent entities attributes included):")
    for schema_name in schema_names:
        print(get_entity_doc(schema_name, "IfcWindow"))

    print("Entity attributes (with parent entities attributes included):")
    for schema_name in schema_names:
        print(get_attribute_doc(schema_name, "IfcWindow", "OwnerHistory"))

    print("Entity predefined types:")
    for schema_name in schema_names:
        print(get_predefined_type_doc(schema_name, "IfcControllerType", "FLOATING"))

    print("Propety sets:")
    for schema_name in schema_names:
        print(get_property_set_doc(schema_name, "Pset_ZoneCommon"))

    # the IFC2X3 examples below query a different property / type than the newer schemas
    print("Propety sets attributes:")
    example_properties = ("Category", "NetPlannedArea", "NetPlannedArea")
    for schema_name, example_property in zip(schema_names, example_properties):
        print(get_property_doc(schema_name, "Pset_ZoneCommon", example_property))

    print("Types:")
    example_types = ("IfcIsothermalMoistureCapacityMeasure", "IfcDuration", "IfcDuration")
    for schema_name, example_type in zip(schema_names, example_types):
        print(get_type_doc(schema_name, example_type))
if __name__ == "__main__":
    # running the module as a script regenerates the docs database, oldest schema first
    doc_extractor = DocExtractor()
    extraction_steps = (
        doc_extractor.extract_ifc2x3,
        doc_extractor.extract_ifc4,
        doc_extractor.extract_ifc4x3,
    )
    for run_extraction in extraction_steps:
        run_extraction()
    # run_doc_api_examples()