# IfcOpenShell - IFC toolkit and geometry engine
# Copyright (C) 2022, 2023 @Andrej730
#
# This file is part of IfcOpenShell.
#
# IfcOpenShell is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# IfcOpenShell is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with IfcOpenShell.  If not, see <http://www.gnu.org/licenses/>.

import copy
import json
from pathlib import Path
from typing import Optional, TypedDict, Union

from typing_extensions import NotRequired

import ifcopenshell
import ifcopenshell.ifcopenshell_wrapper as ifcopenshell_wrapper
import ifcopenshell.util.attribute
import ifcopenshell.util.schema

try:
    import glob
    import re
    import shutil
    import urllib.parse
    import warnings
    import zipfile

    import requests
    from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
    from lxml import etree
    from markdown import markdown
except:
    pass  # Only necessary if you're using it to generate the docs database


# Directory containing this module; documentation sources and the generated
# schema JSON files are all resolved relative to it.
BASE_MODULE_PATH = Path(__file__).parent
# Local copies of the buildingSMART/IFC repository docs (Ifc2.3.0.1 and
# Ifc4.0.2.1) that DocExtractor parses.
IFC2x3_DOCS_LOCATION = BASE_MODULE_PATH / "Ifc2.3.0.1"
IFC4_DOCS_LOCATION = BASE_MODULE_PATH / "Ifc4.0.2.1"

# IFC4.3 sources and spec URL template.
# NOTE(review): not referenced in the visible part of this file - presumably
# used by the IFC4.3 extraction code; the "%s" placeholder is presumably
# filled with the entity / type name. Confirm against that code.
IFC4x3_HTML_LOCATION = BASE_MODULE_PATH / "IFC4.3-html"
IFC4x3_DEV_LOCATION = BASE_MODULE_PATH / "IFC4.3.x-development"
IFC4x3_SPEC_URL_TEMPLATE = "https://ifc43-docs.standards.buildingsmart.org/IFC/RELEASE/IFC4x3/HTML/lexical/%s.htm"


class BaseData(TypedDict):
    """Documentation record shared by entities and types."""

    # Human readable description text.
    description: str
    # Link to the corresponding page of the online specification.
    spec_url: str


class EntityData(BaseData):
    """Documentation record of an IFC entity."""

    # Attribute name -> attribute description.
    attributes: NotRequired[dict[str, str]]
    # Predefined type constant name -> constant description.
    predefined_types: NotRequired[dict[str, str]]


class PsetData(TypedDict):
    """Documentation record of a property set."""

    # Apparently some psets in ifc4 are missing spec url / description.
    description: NotRequired[str]
    spec_url: NotRequired[str]
    # Property name -> property documentation.
    # NOTE(review): DocExtractor.extract_ifc2x3_property_sets stores a dict per
    # property (shaped like PropertyData), not a plain string - confirm whether
    # this annotation should be dict[str, PropertyData].
    properties: dict[str, str]


class PropertyData(TypedDict):
    """Documentation record of a single property, optionally with nested child properties."""

    description: str
    # in IFC4x3 there is no children[] for properties
    # Child property name -> child property record (same structure, recursive).
    children: NotRequired[dict[str, "PropertyData"]]


class ClassesSuggestions(TypedDict):
    """Entry of the ``classes_suggestions`` database (loaded from ``ifc_classes_suggestions.json``)."""

    # NOTE(review): presumably the suggested IFC class name and, optionally,
    # its predefined type - the JSON file is not visible here, confirm.
    name: str
    predefined_type: NotRequired[str]


class SchemaData(TypedDict):
    """Complete documentation database of one IFC schema version.

    The keys match the data types listed in ``SCHEMA_FILES``; each value is the
    parsed content of the corresponding JSON file.
    """

    entities: dict[str, EntityData]
    types: dict[str, BaseData]
    properties: dict[str, PsetData]
    classes_suggestions: dict[str, ClassesSuggestions]


SUPPORTED_SCHEMA = ifcopenshell.util.schema.IFC_SCHEMA
# Schema version -> data type -> JSON file holding that part of the docs
# database. The class suggestions file is the same for every schema version.
SCHEMA_FILES: dict[SUPPORTED_SCHEMA, dict[str, Path]] = {
    "IFC2X3": {
        "entities": BASE_MODULE_PATH / "schema/ifc2x3_entities.json",
        "properties": BASE_MODULE_PATH / "schema/ifc2x3_properties.json",
        "types": BASE_MODULE_PATH / "schema/ifc2x3_types.json",
        "classes_suggestions": BASE_MODULE_PATH / "schema/ifc_classes_suggestions.json",
    },
    "IFC4": {
        "entities": BASE_MODULE_PATH / "schema/ifc4_entities.json",
        "properties": BASE_MODULE_PATH / "schema/ifc4_properties.json",
        "types": BASE_MODULE_PATH / "schema/ifc4_types.json",
        "classes_suggestions": BASE_MODULE_PATH / "schema/ifc_classes_suggestions.json",
    },
    "IFC4X3": {
        "entities": BASE_MODULE_PATH / "schema/ifc4x3_entities.json",
        "properties": BASE_MODULE_PATH / "schema/ifc4x3_properties.json",
        "types": BASE_MODULE_PATH / "schema/ifc4x3_types.json",
        "classes_suggestions": BASE_MODULE_PATH / "schema/ifc_classes_suggestions.json",
    },
}

# Cache of the documentation database; stays None until the first get_db() call
# loads the JSON files.
db: Optional[dict[SUPPORTED_SCHEMA, SchemaData]] = None
# Cache of schema definitions, filled on demand by get_schema_by_name().
schema_by_name: dict[SUPPORTED_SCHEMA, Optional[ifcopenshell_wrapper.schema_definition]] = {
    "IFC2X3": None,
    "IFC4": None,
    "IFC4X3": None,
}


def get_db(version: ifcopenshell.util.schema.IFC_SCHEMA) -> Union[SchemaData, None]:
    """Return the documentation database for the given schema version.

    On the first call every file listed in ``SCHEMA_FILES`` is loaded and the
    result is cached in the module-level ``db``. Missing files are reported
    with a message and skipped.

    :param version: Schema identifier; it is passed through
        ``ifcopenshell.util.schema.get_fallback_schema`` before the lookup.
    :return: The database of the resolved schema version, or None if there is
        no entry for it.
    """
    global db
    if not db:
        # Build the cache locally and publish it only once loading succeeded,
        # so an error while reading a file doesn't leave a partial cache behind.
        loaded: dict = {ifc_version: dict() for ifc_version in SCHEMA_FILES}
        for ifc_version, schema_files in SCHEMA_FILES.items():
            for data_type, schema_path in schema_files.items():
                if not schema_path.is_file():
                    print(f"Schema file {schema_path} wasn't found.")
                    continue

                # The files are written as UTF-8; don't rely on the locale
                # dependent default encoding when reading them back.
                with open(schema_path, "r", encoding="utf-8") as fi:
                    loaded[ifc_version][data_type] = json.load(fi)
        db = loaded

    version = ifcopenshell.util.schema.get_fallback_schema(version)
    return db.get(version)


def get_schema_by_name(version: str) -> ifcopenshell_wrapper.schema_definition:
    """Return the schema definition for ``version``, cached in ``schema_by_name``.

    The version is first resolved with
    ``ifcopenshell.util.schema.get_fallback_schema``; the definition is
    requested from ``ifcopenshell.schema_by_name`` only when the cache has no
    entry for the resolved version yet.
    """
    resolved_version = ifcopenshell.util.schema.get_fallback_schema(version)
    definition = schema_by_name[resolved_version]
    if not definition:
        definition = ifcopenshell.schema_by_name(resolved_version)
        schema_by_name[resolved_version] = definition
    return definition


def get_class_suggestions(
    version: ifcopenshell.util.schema.IFC_SCHEMA,
    class_name: str,
) -> Union[ClassesSuggestions, None]:
    """Return the ``classes_suggestions`` entry stored for ``class_name``.

    :return: The entry, or None if the database is unavailable or has no
        entry for ``class_name``.
    """
    schema_db = get_db(version)
    if not schema_db:
        return None
    return schema_db["classes_suggestions"].get(class_name)


def get_entity_doc(
    version: ifcopenshell.util.schema.IFC_SCHEMA,
    entity_name: str,
    recursive: bool = True,
) -> Union[EntityData, None]:
    """Return a copy of the documentation of an entity.

    :param version: Schema identifier used to pick the database and schema.
    :param entity_name: Name of the entity to look up.
    :param recursive: If True, attribute descriptions of all supertypes are
        merged into the returned ``attributes`` dictionary.
    :return: A deep copy of the entity documentation (safe to modify), or None
        if the database is unavailable or has no entry for ``entity_name``.
    """
    db = get_db(version)
    if not db:
        return None

    entity = copy.deepcopy(db["entities"].get(entity_name))
    # There is nothing to merge parent attributes into if the entity itself is
    # not documented.
    if entity is None or not recursive:
        return entity

    ifc_schema = get_schema_by_name(version)
    ifc_supertype = ifc_schema.declaration_by_name(entity_name).supertype()
    if not ifc_supertype:
        return entity

    attributes = entity.setdefault("attributes", dict())
    parent_entity = get_entity_doc(version, ifc_supertype.name(), recursive=True)
    # The supertype may be missing from the docs database, skip it in that case.
    if parent_entity is not None:
        attributes.update(parent_entity.get("attributes", {}))
    return entity


def get_attribute_doc(
    version: ifcopenshell.util.schema.IFC_SCHEMA,
    entity: str,
    attribute: str,
    recursive=True,
) -> Union[str, None]:
    """Return the description of ``attribute`` of ``entity``.

    ``recursive`` is forwarded to ``get_entity_doc``. None is returned when the
    database, the entity documentation or the attribute entry is not available.
    """
    if not get_db(version):
        return None
    entity_doc = get_entity_doc(version, entity, recursive)
    if not entity_doc or "attributes" not in entity_doc:
        return None
    return entity_doc["attributes"].get(attribute)


def get_predefined_type_doc(
    version: ifcopenshell.util.schema.IFC_SCHEMA,
    entity: str,
    predefined_type: str,
) -> Union[str, None]:
    """Return the description of ``predefined_type`` of ``entity``.

    None is returned when the database, the entity documentation or the
    predefined type entry is not available.
    """
    schema_db = get_db(version)
    if not schema_db:
        return None
    entity_doc = schema_db["entities"].get(entity)
    if not entity_doc:
        return None
    known_types = entity_doc.get("predefined_types", {})
    return known_types.get(predefined_type)


def get_property_set_doc(version: ifcopenshell.util.schema.IFC_SCHEMA, pset: str) -> Union[PsetData, None]:
    """Return the documentation of property set ``pset``, or None if it is not available."""
    schema_db = get_db(version)
    if not schema_db:
        return None
    return schema_db["properties"].get(pset)


def get_property_doc(version: ifcopenshell.util.schema.IFC_SCHEMA, pset: str, prop: str) -> Union[str, None]:
    """Return the documentation stored for property ``prop`` of property set ``pset``.

    None is returned when the database, the property set or the property entry
    is not available.
    """
    schema_db = get_db(version)
    if not schema_db:
        return None
    pset_doc = schema_db["properties"].get(pset)
    if not pset_doc:
        return None
    return pset_doc["properties"].get(prop)


def get_type_doc(version: ifcopenshell.util.schema.IFC_SCHEMA, ifc_type: str) -> Union[BaseData, None]:
    """Return the documentation of the type ``ifc_type``, or None if it is not available."""
    schema_db = get_db(version)
    if not schema_db:
        return None
    return schema_db["types"].get(ifc_type)


# TODO: there are still some discrepancies between this method
# and the specs website because of the asymmetry
# More: https://github.com/buildingSMART/IFC4.3.x-development/issues/582
def get_inverse_attributes(el):
    """Return the inverse attributes of ``el`` whose referenced attribute names ``el`` itself.

    For every inverse attribute the type of the referenced forward attribute
    is resolved (aggregations unwrapped, select types expanded). The inverse
    attribute is kept only if ``el.name()`` is among the resolved type names.
    """
    wrapper = ifcopenshell.ifcopenshell_wrapper
    matching_attrs = []
    for inverse_attr in el.all_inverse_attributes():
        referenced_type = inverse_attr.attribute_reference().type_of_attribute()
        # strip all aggregation levels to get to the element type
        while isinstance(referenced_type, wrapper.aggregation_type):
            referenced_type = referenced_type.type_of_element()

        # expand select types until only non-select declarations are left
        pending = [referenced_type.declared_type()]
        resolved_names = []
        while pending:
            current = pending.pop()
            if isinstance(current, wrapper.select_type):
                pending.extend(current.select_list())
            else:
                resolved_names.append(current.name())

        if el.name() in resolved_names:
            matching_attrs.append(inverse_attr)
    return matching_attrs


class DocExtractor:
    def clean_highlighted_words(self, text: str) -> str:
        """Remove ``_word_`` and ``**word**`` markup around single alphanumeric words."""
        highlight_patterns = (
            r"\b_([a-zA-Z0-9]+)_\b",
            r"\*\*([a-zA-Z0-9]+)\*\*",
        )
        for pattern in highlight_patterns:
            # keep only the highlighted word itself
            text = re.sub(pattern, r"\1", text)
        return text

    def clean_description(self, description):
        """Flatten a description to one line and cut it at the first ``HISTORY:`` marker.

        Newlines and non-breaking spaces become regular spaces, everything
        from the first "HISTORY:" on is dropped and the result is stripped.
        """
        single_line = description.replace("\n", " ").replace("\u00a0", " ")
        before_history, _, _ = single_line.partition("HISTORY:")
        return before_history.strip()

    def extract_ifc2x3(self):
        """Run all IFC2X3 extraction steps.

        :raises Exception: if the ``IFC2x3_DOCS_LOCATION`` folder doesn't exist.
        """
        print("Parsing data for Ifc2.3.0.1")
        docs_found = IFC2x3_DOCS_LOCATION.is_dir()
        if not docs_found:
            error_message = (
                f'Docs for IFC 2.3.0.1 expected to be in folder "{IFC2x3_DOCS_LOCATION.resolve()}\\"\n'
                "For doc extraction please either setup docs as described above \n"
                "or change IFC2x3_DOCS_LOCATION in the script accordingly.\n"
                "You can download docs from the repository: \n"
                "https://github.com/buildingSMART/IFC/tree/Ifc2.3.0.1"
            )
            raise Exception(error_message)

        # The site domains have to be parsed first: domains taken from the
        # github paths do not match the domains used on the website (probably
        # because the website domains come from 4_0).
        # Example (property set / github domain / website domain):
        # Pset_AirTerminalBoxPHistory IfcControlExtension IfcHvacDomain
        extraction_steps = (
            self.extract_ifc2x3_property_sets_site_domains,
            self.extract_ifc2x3_entities,
            self.extract_ifc2x3_property_sets,
            self.extract_ifc2x3_types,
        )
        for run_step in extraction_steps:
            run_step()

    def extract_ifc2x3_property_sets_site_domains(self):
        """Download the IFC2X3 property sets index and store the domain of each property set.

        Every link of the index page is expected to look like
        ``./<domain>/<pset>.xml``. The resulting ``{pset: domain}`` mapping is
        written to ``schema/ifc2x3_property_sets_site_domains.json``.

        :raises requests.HTTPError: if the index page can't be retrieved.
        """
        property_sets_domains = dict()
        index_url = "https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/psd/psd_index.htm"
        with requests.get(index_url, timeout=60) as r:
            # Fail loudly instead of parsing an error page and overwriting the
            # domains file with useless data.
            r.raise_for_status()
            html = BeautifulSoup(r.content, features="lxml")

        for a in html.find_all("a"):
            domain, pset = a["href"].removeprefix("./").removesuffix(".xml").split("/")
            property_sets_domains[pset] = domain

        # export property sets data
        with open(BASE_MODULE_PATH / "schema/ifc2x3_property_sets_site_domains.json", "w", encoding="utf-8") as fo:
            print(f"{len(property_sets_domains)} property sets domains were parsed from the website")
            json.dump(property_sets_domains, fo, sort_keys=True, indent=4)

    def setup_ifc2x3_reference_lookup(self):
        """Build the lookup table that converts IFC2X3 hrefs to actual data paths.

        :return: Dictionary mapping a reference (path stem without "$"
            characters) to the matching path found under ``Properties/*/*``
            or ``Constants/*/*`` of the IFC2x3 docs. If the same reference
            occurs in both, the ``Constants`` path wins.
        """
        references_paths_lookup = dict()
        glob_queries = (
            f"{IFC2x3_DOCS_LOCATION}/Properties/*/*",
            f"{IFC2x3_DOCS_LOCATION}/Constants/*/*",
        )
        for glob_query in glob_queries:
            for parsed_path in glob.iglob(glob_query, recursive=False):
                parsed_path = Path(parsed_path)
                # all references omit "$" character, I've checked it on 2_3
                # need to check it if moving to next IFC version
                property_reference = parsed_path.stem.replace("$", "")
                references_paths_lookup[property_reference] = parsed_path
        return references_paths_lookup

    def extract_ifc2x3_entities(self):
        """Extract IFC2X3 entities documentation into ``schema/ifc2x3_entities.json``.

        For each entity folder found under ``Sections/**/Entities`` of the
        IFC2x3 docs:

        - the first paragraph of ``Documentation.md`` becomes the description;
        - ``DocEntity.xml`` provides the attribute descriptions;
        - for the ``PredefinedType`` attribute the constants of the referenced
          enumeration are resolved and stored as predefined types;
        - the spec url is built from the section folder name and entity name.
        """
        ifc2x3_references_paths_lookup = self.setup_ifc2x3_reference_lookup()
        ifc4_references_paths_lookup = self.setup_ifc4_reference_lookup()
        entities_dict = dict()

        # Attribute descriptions are cut at the first occurrence of any of
        # these markers to discard the changelog part of the description.
        # Example:
        # https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/ifcpresentationdefinitionresource/lexical/ifcannotationfillarea.htm
        changelog_markers = (
            "IFC2x Edition 3 CHANGE",
            "IFC2x Edition 2 Addendum 2 CHANGE",
            "IFC2x2 Addendum 1 change",
            "IFC2x PLATFORM CHANGE",
            "IFC2x3 CHANGE",
            "IFC2x Edition3 CHANGE",
        )

        # search
        entities_paths = list(glob.iglob(f"{IFC2x3_DOCS_LOCATION}/Sections/**/Entities", recursive=True))
        for parse_folder_path in entities_paths:
            for entity_path in glob.iglob(f"{parse_folder_path}/**/"):
                entity_path = Path(entity_path)
                entity_name = entity_path.stem
                entities_dict[entity_name] = dict()

                md_path = entity_path / "Documentation.md"
                xml_path = entity_path / "DocEntity.xml"

                # utf-8-sig because \ufeff occurs - meaning the file is utf bom encoded
                with open(md_path, "r", encoding="utf-8-sig") as fi:
                    # convert markdown to html for easier parsing
                    html = markdown(fi.read())
                    entity_description = BeautifulSoup(html, features="lxml").find("p").text
                    entity_description = entity_description.replace("\n", " ")
                    entity_description = entity_description.replace("\u00a0", " ")

                with open(xml_path, "r", encoding="utf-8") as fi:
                    bs_tree = BeautifulSoup(fi.read(), features="lxml")

                entity_attrs = dict()
                predefined_types = dict()
                # temporarily disable MarkupResemblesLocatorWarning
                # because BeautifulSoup wrongly assumes we confused
                # html code for filepath and gives warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning)

                    for html_attr in bs_tree.find_all("docattribute"):
                        attr_name = html_attr["name"]
                        if attr_name == "PredefinedType":
                            # get references to all predefined types
                            defined_type = html_attr["definedtype"]
                            enum_path = xml_path.parents[2] / "Types" / defined_type / "DocEnumeration.xml"
                            with open(enum_path, "r", encoding="utf-8") as fi:
                                enum_bs_tree = BeautifulSoup(fi.read(), features="lxml")
                            hrefs = [i["href"] for i in enum_bs_tree.find_all("docconstant")]

                            # iterate over list of predefined types
                            for href in hrefs:
                                # in IFC2X3 all documentation for constants is empty
                                # and as a temporary solution I'm trying to get constant's description from IFC4.
                                # The IFC2X3 lookup is consulted only if IFC4 has no such constant,
                                # so a constant missing from it doesn't fail the IFC4 lookup.
                                const_path = ifc4_references_paths_lookup.get(href)
                                if const_path is None:
                                    const_path = ifc2x3_references_paths_lookup[href]
                                with open(const_path, "r", encoding="utf-8") as fi:
                                    const_bs_tree = BeautifulSoup(fi.read(), features="lxml")
                                const_name = const_bs_tree.find("docconstant")["name"]
                                description_tag = const_bs_tree.find("documentation")
                                const_description = "" if not description_tag else description_tag.text
                                predefined_types[const_name] = const_description

                        else:
                            html_description = BeautifulSoup(html_attr.text, features="lxml")
                            attr_description = html_description.get_text()

                            attr_description = attr_description.replace("\n", " ")
                            attr_description = attr_description.replace("\u00a0", " ")
                            attr_description = attr_description.replace("&npsp;", " ")

                            # discard part of the description with changelog
                            for changelog_marker in changelog_markers:
                                attr_description = attr_description.split(changelog_marker, 1)[0]

                            attr_description = attr_description.strip().rstrip(">").strip()
                            entity_attrs[attr_name] = attr_description

                if entity_attrs:
                    entities_dict[entity_name]["attributes"] = entity_attrs

                if predefined_types:
                    entities_dict[entity_name]["predefined_types"] = predefined_types

                entities_dict[entity_name]["description"] = entity_description
                spec_url = (
                    "https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/"
                    f"{md_path.parents[2].name.lower()}/lexical/{entity_name.lower()}.htm"
                )
                entities_dict[entity_name]["spec_url"] = spec_url

        # export entities data
        with open(BASE_MODULE_PATH / "schema/ifc2x3_entities.json", "w", encoding="utf-8") as fo:
            print(f"{len(entities_dict)} entities parsed")
            json.dump(entities_dict, fo, sort_keys=True, indent=4)

    def extract_ifc2x3_property_sets(self):
        """Extract IFC2X3 property sets documentation into ``schema/ifc2x3_properties.json``.

        Works in two passes. First, every property set folder found under
        ``Sections/**/PropertySets`` is read to collect its description, spec
        url and the hrefs of its properties. Then each href is resolved to a
        property folder and the property name, description and (recursively)
        child properties are read from there.

        Requires ``schema/ifc2x3_property_sets_site_domains.json`` written by
        ``extract_ifc2x3_property_sets_site_domains``.
        """
        property_sets_dict = dict()
        # property set name -> list of hrefs of its properties
        property_sets_references = dict()

        # extract lists of properties and their references for each property set
        parsed_paths = [
            filepath for filepath in glob.iglob(f"{IFC2x3_DOCS_LOCATION}/Sections/**/PropertySets", recursive=True)
        ]

        # prepare property sets domains from the website we extracted earlier
        with open(BASE_MODULE_PATH / "schema/ifc2x3_property_sets_site_domains.json", "r") as fi:
            property_sets_site_domains = json.load(fi)

        for parse_folder_path in parsed_paths:
            for property_set_path in glob.iglob(f"{parse_folder_path}/**/"):
                property_set_path = Path(property_set_path)
                property_set_name = property_set_path.stem
                property_set_dict = dict()

                property_references = list()
                xml_path = property_set_path / "DocPropertySet.xml"
                md_path = property_set_path / "Documentation.md"

                if md_path.is_file():
                    with open(md_path, "r", encoding="utf-8-sig") as fi:
                        # convert markdown to html for easier parsing
                        html = markdown(fi.read())
                        # only the first paragraph is used as the description
                        property_set_description = BeautifulSoup(html, features="lxml").find("p").text
                        property_set_description = property_set_description.replace("\n", " ")
                        property_set_description = property_set_description.split("HISTORY:", 1)[0]
                        property_set_description = property_set_description.strip()
                        property_set_dict["description"] = property_set_description
                else:
                    print(
                        f"WARNING. Property set {property_set_name} has no Documentation.md, "
                        f"property set will be left without description."
                    )

                with open(xml_path, "r", encoding="utf-8") as fi:
                    bs_tree = BeautifulSoup(fi.read(), features="lxml")
                    for html_attr in bs_tree.find_all("docproperty"):
                        property_references.append(html_attr["href"])

                property_sets_references[property_set_name] = property_references
                # raises KeyError if the property set is missing from the domains parsed from the website
                property_set_domain = property_sets_site_domains[property_set_name]
                spec_url = (
                    "https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML"
                    f"/psd/{property_set_domain}/{property_set_name}.xml"
                )
                property_set_dict["spec_url"] = spec_url
                property_sets_dict[property_set_name] = property_set_dict

        # setup references look up tables to convert property hrefs to actual data paths
        references_paths_lookup = self.setup_ifc2x3_reference_lookup()

        # setup a function because we'll need to check child properties recursively
        def get_property_info_by_href(href):
            # Returns a (property name, property dict) tuple for the property
            # the href refers to. The dict has "description" if Documentation.md
            # exists and "children" if the property xml has an <elements> tag.
            property_dict = dict()
            property_path = references_paths_lookup[href]

            md_path = property_path / "Documentation.md"
            xml_path = property_path / "DocProperty.xml"
            # github urls are only used in the messages printed below
            # NOTE(review): github_md_url is not used anywhere in this function
            md_url_part = urllib.parse.quote(str(md_path.relative_to(Path(__file__).parent).as_posix()))
            xml_url_part = urllib.parse.quote(str(xml_path.relative_to(Path(__file__).parent).as_posix()))
            github_md_url = f"https://github.com/buildingSMART/IFC/blob/{md_url_part}"
            github_xml_url = f"https://github.com/buildingSMART/IFC/blob/{xml_url_part}"

            with open(xml_path, "r", encoding="utf-8") as fi:
                bs_tree = BeautifulSoup(fi.read(), features="lxml")
                tags = bs_tree.find_all("docproperty")

                # check for child properties - if they are present parse their data recursively
                elements_tag = bs_tree.find("elements")
                if elements_tag is not None:
                    child_tags = elements_tag.find_all("docproperty")
                    child_tags_dict = dict()

                    for child_tag in child_tags:
                        child_tag_href = child_tag["href"]
                        child_tag_name, child_tag_dict = get_property_info_by_href(child_tag_href)
                        child_tags_dict[child_tag_name] = child_tag_dict
                        # child tags are removed so that only the property's own tag remains in tags
                        tags.remove(child_tag)
                    property_dict["children"] = child_tags_dict
                    print(f"Child nodes found inside property xml. Url: {github_xml_url}")

                if len(tags) != 1:
                    print(
                        f"WARNING. Found more properties inside property xml, "
                        f"only the first one was parsed (number of properties: {len(tags)}). Url: {github_xml_url}."
                    )
                property_name = tags[0]["name"]

            if not md_path.is_file():
                print(
                    f"WARNING. Property {property_name} is missing documentation.md, "
                    f"property will be left without description. Url: {github_xml_url}"
                )
            else:
                with open(md_path, "r", encoding="utf-8-sig") as fi:
                    # convert markdown to html for easier parsing
                    html = markdown(fi.read())
                    description = BeautifulSoup(html, features="lxml").find("p").text
                    description = description.replace("\n", " ")
                    description = description.replace("\u00a0", " ")
                    property_dict["description"] = description
            return (property_name, property_dict)

        # lookup each property reference and save its name and description
        for property_set_name in property_sets_references:
            properties_dict = dict()
            for property_reference in property_sets_references[property_set_name]:
                property_name, property_dict = get_property_info_by_href(property_reference)
                properties_dict[property_name] = property_dict
            property_sets_dict[property_set_name]["properties"] = properties_dict

        # export property sets data
        with open(BASE_MODULE_PATH / "schema/ifc2x3_properties.json", "w", encoding="utf-8") as fo:
            print(f"{len(property_sets_dict)} property sets parsed")
            json.dump(property_sets_dict, fo, sort_keys=True, indent=4)

    def extract_ifc2x3_types(self):
        """Extract IFC2X3 types documentation into ``schema/ifc2x3_types.json``.

        For each type folder found under ``Sections/**/Types`` the first
        paragraph of ``Documentation.md`` is cleaned up and stored as the
        description (only if it's not empty) together with the spec url.
        """
        # prefixes referring to the ISO standard that are removed from descriptions
        iso_prefixes = (
            "Definition from ISO/CD 10303-46:1992: ",
            "Definition from ISO/CD 10303-42:1992 ",
            "Definition from ISO/CD 10303-42:1992: ",
            "Definition from ISO/CD 10303-41:1992: ",
        )
        types_dict = dict()

        # search
        types_folders = list(glob.iglob(f"{IFC2x3_DOCS_LOCATION}/Sections/**/Types", recursive=True))
        for types_folder in types_folders:
            for type_folder in glob.iglob(f"{types_folder}/**/"):
                type_folder = Path(type_folder)
                type_name = type_folder.stem
                type_data = dict()
                types_dict[type_name] = type_data
                md_path = type_folder / "Documentation.md"

                # utf-8-sig because \ufeff occurs - meaning the file is utf bom encoded
                with open(md_path, "r", encoding="utf-8-sig") as fi:
                    # convert markdown to html for easier parsing
                    html = markdown(fi.read())

                first_paragraph = BeautifulSoup(html, features="lxml").find("p").text
                type_description = first_paragraph.replace("\n", " ").replace("\u00a0", " ")
                for iso_prefix in iso_prefixes:
                    type_description = type_description.replace(iso_prefix, "")
                type_description = type_description.strip()

                if type_description:
                    type_data["description"] = type_description

                section_name = md_path.parents[2].name.lower()
                type_data["spec_url"] = (
                    "https://standards.buildingsmart.org/IFC/RELEASE/IFC2x3/TC1/HTML/"
                    f"{section_name}/lexical/{type_name.lower()}.htm"
                )

        # export types data
        with open(BASE_MODULE_PATH / "schema/ifc2x3_types.json", "w", encoding="utf-8") as fo:
            print(f"{len(types_dict)} ifc types parsed")
            json.dump(types_dict, fo, sort_keys=True, indent=4)

    def extract_ifc4(self):
        """Run all IFC4 extraction steps.

        :raises Exception: if the ``IFC4_DOCS_LOCATION`` folder doesn't exist.
        """
        print("Parsing data for Ifc4.0.2.1")
        docs_found = IFC4_DOCS_LOCATION.is_dir()
        if not docs_found:
            error_message = (
                f'Docs for Ifc4.0.2.1 expected to be in folder "{IFC4_DOCS_LOCATION.resolve()}\\"\n'
                "For doc extraction please either setup docs as described above \n"
                "or change IFC4_DOCS_LOCATION in the script accordingly.\n"
                "You can download docs from the repository: \n"
                "https://github.com/buildingSMART/IFC/tree/Ifc4.0.2.1"
            )
            raise Exception(error_message)

        # Domains in Ifc 4.0 are actually consistent between the website and
        # the docs, BUT the site is missing two property sets which therefore
        # won't have a spec_url - that's why the site is still parsed.
        # Missed property sets:
        # Pset_BuildingElementCommon Pset_ElementCommon
        extraction_steps = (
            self.extract_ifc4_property_sets_site_domains,
            self.extract_ifc4_entities,
            self.extract_ifc4_property_sets,
            self.extract_ifc4_types,
        )
        for run_step in extraction_steps:
            run_step()

    def extract_ifc4_property_sets_site_domains(self):
        """Download the IFC4 property / quantity sets listings and store the domain of each set.

        The domain and the set name are taken from the 4th and 6th "/"
        separated parts of the href of every ``listing-link`` anchor. The
        resulting ``{pset: domain}`` mapping is written to
        ``schema/ifc4_property_sets_site_domains.json``.

        :raises requests.HTTPError: if one of the listing pages can't be retrieved.
        """
        property_sets_domains = dict()
        listings_url = "https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML/annex/annex-b/"
        # property sets first, then quantity sets
        for listing_page in ("alphabeticalorder_psets.htm", "alphabeticalorder_qsets.htm"):
            with requests.get(listings_url + listing_page, timeout=60) as r:
                # Fail loudly instead of parsing an error page and writing
                # an incomplete domains file.
                r.raise_for_status()
                html = BeautifulSoup(r.content, features="lxml")

            for a in html.find_all("a", {"class": "listing-link"}):
                href_split = a["href"].split("/")
                domain = href_split[3]
                pset = href_split[5].removesuffix(".htm")
                property_sets_domains[pset] = domain

        # export property sets data
        with open(BASE_MODULE_PATH / "schema/ifc4_property_sets_site_domains.json", "w", encoding="utf-8") as fo:
            print(f"{len(property_sets_domains)} property sets domains were parsed from the website")
            json.dump(property_sets_domains, fo, sort_keys=True, indent=4)

    def setup_ifc4_reference_lookup(self):
        """Build the lookup table that converts IFC4 hrefs to actual data paths.

        :return: Dictionary mapping a reference (path stem without "$"
            characters) to the matching path found under ``Properties/*/*``,
            ``Quantities/*/*`` or ``Constants/*/*`` of the IFC4 docs. If a
            reference occurs more than once the path found last is kept.
        """
        glob_queries = (
            f"{IFC4_DOCS_LOCATION}/Properties/*/*",
            f"{IFC4_DOCS_LOCATION}/Quantities/*/*",
            f"{IFC4_DOCS_LOCATION}/Constants/*/*",
        )
        found_paths = []
        for glob_query in glob_queries:
            found_paths.extend(glob.iglob(glob_query, recursive=False))

        # all references omit "$" character, I've checked it on 4_0
        # need to check it if moving to next IFC version
        # btw no reason to check if all references were used in properties
        # because there are also child properties
        lookup = dict()
        for found_path in map(Path, found_paths):
            lookup[found_path.stem.replace("$", "")] = found_path
        return lookup

    def extract_ifc4_entities(self):
        """Parse IFC4 entity docs and export them to ``schema/ifc4_entities.json``.

        Every subfolder of each ``Sections/**/Entities`` folder of the IFC4 docs
        is treated as one entity: the first paragraph of its ``Documentation.md``
        becomes the entity description, and ``DocEntity.xml`` provides the
        attribute descriptions and the enumeration referenced by the
        ``PredefinedType`` attribute. An entity whose ``Documentation.md`` has no
        paragraph gets an empty description and a warning is printed.
        """
        references_paths_lookup = self.setup_ifc4_reference_lookup()
        entities_dict = dict()

        # Markers opening the changelog, notes, examples etc. part of an attribute
        # description; everything from the earliest marker on is discarded.
        # Those notes actually can be useful but we'll need a way to reformat them
        # Example:
        # https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML/schema/ifcsharedbldgelements/lexical/ifcrelconnectspathelements.htm
        discarded_section_markers = (
            "{ .change-ifc",
            "{ .note",
            "{ .examples",
            "{ .deprecated",
            "{ .history",
        )

        for parse_folder_path in glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/Entities", recursive=True):
            for entity_path in glob.iglob(f"{parse_folder_path}/**/"):
                entity_path = Path(entity_path)
                entity_name = entity_path.stem
                entity_data = dict()

                md_path = entity_path / "Documentation.md"
                xml_path = entity_path / "DocEntity.xml"

                # utf-8-sig because \ufeff occurs - meaning the file is saved with a utf bom
                with open(md_path, "r", encoding="utf-8-sig") as fi:
                    # convert markdown to html for easier parsing
                    html = markdown(fi.read())

                first_paragraph = BeautifulSoup(html, features="lxml").find("p")
                if first_paragraph is None:
                    # `find` returns None when there is no <p> tag at all
                    print(
                        f"WARNING. Entity {entity_name} has no paragraph in Documentation.md, "
                        "entity will be left with empty description."
                    )
                    entity_description = ""
                else:
                    entity_description = first_paragraph.text
                    entity_description = entity_description.replace("\n", " ")
                    entity_description = entity_description.replace("\u00a0", " ")
                    entity_description = entity_description.replace("{ .extDef}", "")
                    entity_description = entity_description.strip()

                with open(xml_path, "r", encoding="utf-8") as fi:
                    bs_tree = BeautifulSoup(fi.read(), features="lxml")

                entity_attrs = dict()
                predefined_types = dict()
                # temporarily disable MarkupResemblesLocatorWarning
                # because BeautifulSoup wrongly assumes we confused
                # html code for filepath and gives warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=MarkupResemblesLocatorWarning)

                    for html_attr in bs_tree.find_all("docattribute"):
                        attr_name = html_attr["name"]
                        if attr_name == "PredefinedType":
                            # get references to all predefined types
                            defined_type = html_attr["definedtype"]
                            enum_path = xml_path.parents[2] / "Types" / defined_type / "DocEnumeration.xml"
                            with open(enum_path, "r", encoding="utf-8") as fi:
                                enum_bs_tree = BeautifulSoup(fi.read(), features="lxml")

                            # iterate over list of predefined types
                            for constant_reference in enum_bs_tree.find_all("docconstant"):
                                const_path = references_paths_lookup[constant_reference["href"]]
                                with open(const_path, "r", encoding="utf-8") as fi:
                                    const_bs_tree = BeautifulSoup(fi.read(), features="lxml")
                                const_name = const_bs_tree.find("docconstant")["name"]
                                description_tag = const_bs_tree.find("documentation")
                                const_description = "" if not description_tag else description_tag.text
                                predefined_types[const_name] = const_description
                        else:
                            html_description = BeautifulSoup(html_attr.text, features="lxml")
                            attr_description = html_description.get_text()
                            attr_description = attr_description.replace("\n", " ")
                            attr_description = attr_description.replace("\u00a0", " ")

                            for marker in discarded_section_markers:
                                attr_description = attr_description.split(marker, 1)[0]

                            entity_attrs[attr_name] = attr_description.strip()

                if entity_attrs:
                    entity_data["attributes"] = entity_attrs

                if predefined_types:
                    entity_data["predefined_types"] = predefined_types

                entity_data["description"] = entity_description
                # parents[2] of the markdown path is the folder that contains "Entities"
                entity_data["spec_url"] = (
                    "https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML/schema/"
                    f"{md_path.parents[2].name.lower()}/lexical/{entity_name.lower()}.htm"
                )
                entities_dict[entity_name] = entity_data

        # export entities data
        with open(BASE_MODULE_PATH / "schema/ifc4_entities.json", "w", encoding="utf-8") as fo:
            print(f"{len(entities_dict)} entities parsed")
            json.dump(entities_dict, fo, sort_keys=True, indent=4)

    def extract_ifc4_property_sets(self):
        """Parse IFC4 property and quantity sets and export them to ``schema/ifc4_properties.json``.

        The set folders (``PropertySets`` / ``QuantitySets``) provide the set
        description and the hrefs of the properties they contain. Each href is
        then resolved through the lookup returned by
        ``setup_ifc4_reference_lookup`` to read the property name, description
        and (recursively) its child properties.
        """
        # function parses both property and quantity sets
        property_sets_dict = dict()
        property_sets_references = dict()

        # extract lists of properties and their references for each property set
        parsed_paths = list(glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/PropertySets", recursive=True))
        parsed_paths += glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/QuantitySets", recursive=True)

        # prepare property sets domains from the website we extracted earlier
        with open(BASE_MODULE_PATH / "schema/ifc4_property_sets_site_domains.json", "r") as fi:
            property_sets_site_domains = json.load(fi)

        for parse_folder_path in parsed_paths:
            for property_set_path in glob.iglob(f"{parse_folder_path}/**/"):
                property_set_path = Path(property_set_path)
                property_set_name = property_set_path.stem
                property_set_dict = dict()

                property_references = list()
                property_quantity = property_set_path.parents[0].name == "QuantitySets"
                xml_path = property_set_path / ("DocQuantitySet.xml" if property_quantity else "DocPropertySet.xml")
                md_path = property_set_path / "Documentation.md"

                if md_path.is_file():
                    with open(md_path, "r", encoding="utf-8-sig") as fi:
                        # convert markdown to html for easier parsing
                        html = markdown(fi.read())
                        property_set_description = BeautifulSoup(html, features="lxml").find("p").text
                        property_set_description = property_set_description.replace("\n", " ")
                        property_set_description = property_set_description.split("HISTORY:", 1)[0]
                        property_set_description = property_set_description.strip()
                        property_set_dict["description"] = property_set_description
                else:
                    print(
                        f"WARNING. Property set {property_set_name} has no Documentation.md, "
                        f"property set will be left without description."
                    )

                with open(xml_path, "r", encoding="utf-8") as fi:
                    bs_tree = BeautifulSoup(fi.read(), features="lxml")
                    for html_attr in bs_tree.find_all("docquantity" if property_quantity else "docproperty"):
                        property_references.append(html_attr["href"])

                property_sets_references[property_set_name] = property_references

                # the site domains lookup is keyed by lowercase set names
                site_domain_key = property_set_name.lower()
                if site_domain_key not in property_sets_site_domains:
                    print(
                        f"WARNING. {property_set_name} was not found on the spec website, "
                        "this property set won't have any spec_url in schema."
                    )
                else:
                    property_set_domain = property_sets_site_domains[site_domain_key]
                    spec_url = (
                        "https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML"
                        f"/schema/{property_set_domain}"
                        f"/{'qset' if property_quantity else 'pset'}"
                        f"/{site_domain_key}.htm"
                    )
                    property_set_dict["spec_url"] = spec_url
                property_sets_dict[property_set_name] = property_set_dict

        # setup references look up tables to convert property hrefs to actual data paths
        references_paths_lookup = self.setup_ifc4_reference_lookup()

        # setup a function because we'll need to check child properties recursively
        def get_property_info_by_href(href):
            """Return ``(property_name, property_dict)`` for the property referenced by ``href``."""
            property_dict = dict()
            property_path = references_paths_lookup[href]

            property_quantity = property_path.parents[1].name == "Quantities"
            tag_name = "docquantity" if property_quantity else "docproperty"

            md_path = property_path / "Documentation.md"
            xml_path = property_path / ("DocQuantity.xml" if property_quantity else "DocProperty.xml")
            # the url is only used to point to the source file in the messages below
            xml_url_part = urllib.parse.quote(str(xml_path.relative_to(Path(__file__).parent).as_posix()))
            github_xml_url = f"https://github.com/buildingSMART/IFC/blob/{xml_url_part}"

            # read the whole file first so it's not kept open during the recursion below
            with open(xml_path, "r", encoding="utf-8") as fi:
                bs_tree = BeautifulSoup(fi.read(), features="lxml")
            tags = bs_tree.find_all(tag_name)

            # check for child properties - if they are present parse their data recursively
            elements_tag = bs_tree.find("elements")
            if elements_tag is not None:
                child_tags_dict = dict()

                for child_tag in elements_tag.find_all(tag_name):
                    child_tag_name, child_tag_dict = get_property_info_by_href(child_tag["href"])
                    child_tags_dict[child_tag_name] = child_tag_dict
                    # child references are not the property described by this file
                    tags.remove(child_tag)
                property_dict["children"] = child_tags_dict
                print(f"Child nodes found inside property xml. Url: {github_xml_url}")

            if len(tags) != 1:
                print(
                    f"WARNING. Found more properties inside property xml, "
                    f"only the first one was parsed (number of properties: {len(tags)}). Url: {github_xml_url}."
                )
            property_name = tags[0]["name"]

            if not md_path.is_file():
                print(
                    f"WARNING. Property {property_name} is missing documentation.md, property will be left without description. "
                    f"Url: {github_xml_url}"
                )
            else:
                with open(md_path, "r", encoding="utf-8-sig") as fi:
                    # convert markdown to html for easier parsing
                    html = markdown(fi.read())
                    description = BeautifulSoup(html, features="lxml").find("p").text
                    description = description.replace("\n", " ")
                    description = description.replace("\u00a0", " ")
                    property_dict["description"] = description
            return (property_name, property_dict)

        # lookup each property reference and save its name and description
        for property_set_name, property_references in property_sets_references.items():
            properties_dict = dict()
            for property_reference in property_references:
                property_name, property_dict = get_property_info_by_href(property_reference)
                properties_dict[property_name] = property_dict
            property_sets_dict[property_set_name]["properties"] = properties_dict

        # export property sets data
        with open(BASE_MODULE_PATH / "schema/ifc4_properties.json", "w", encoding="utf-8") as fo:
            print(f"{len(property_sets_dict)} property sets parsed")
            json.dump(property_sets_dict, fo, sort_keys=True, indent=4)

    def extract_ifc4_types(self):
        """Parse IFC4 type docs and export them to ``schema/ifc4_types.json``.

        Every subfolder of each ``Sections/**/Types`` folder of the IFC4 docs is
        treated as one type. The first paragraph of its ``Documentation.md``,
        after cleanup, is stored as the description unless it is empty.
        """
        # (old, new) substitutions applied in this order to the first paragraph
        cleanup_substitutions = (
            ("\n", " "),
            ("\u00a0", " "),
            ("{ .extDef}", ""),
            ("NOTE  Definition according to ISO/CD 10303-41:1992 ", ""),
            ("Definition from ISO/CD 10303-41:1992: ", ""),
        )
        spec_url_root = "https://standards.buildingsmart.org/IFC/RELEASE/IFC4/ADD2_TC1/HTML/schema/"

        types_dict = dict()
        types_folders = list(glob.iglob(f"{IFC4_DOCS_LOCATION}/Sections/**/Types", recursive=True))
        for types_folder in types_folders:
            for type_folder in glob.iglob(f"{types_folder}/**/"):
                type_dir = Path(type_folder)
                type_name = type_dir.stem
                type_data = types_dict[type_name] = dict()
                doc_path = type_dir / "Documentation.md"

                # utf-8-sig because \ufeff occurs - meaning the file is saved with a utf bom
                raw_markdown = doc_path.read_text(encoding="utf-8-sig")
                # convert markdown to html for easier parsing
                html = markdown(raw_markdown.replace("{ .extDef}", ""))
                type_description = BeautifulSoup(html, features="lxml").find("p").text
                for old, new in cleanup_substitutions:
                    type_description = type_description.replace(old, new)
                type_description = type_description.strip()

                if type_description:
                    type_data["description"] = type_description

                # parents[2] of the markdown path is the folder that contains "Types"
                domain_name = doc_path.parents[2].name.lower()
                type_data["spec_url"] = f"{spec_url_root}{domain_name}/lexical/{type_name.lower()}.htm"

        # export types data
        with open(BASE_MODULE_PATH / "schema/ifc4_types.json", "w", encoding="utf-8") as fo:
            print(f"{len(types_dict)} ifc types parsed")
            json.dump(types_dict, fo, sort_keys=True, indent=4)

    def extract_ifc4x3(self):
        """Check the Ifc4.3.0.1 prerequisites, then run the IFC4x3 extractors.

        Raises:
            Exception: if the development repository folder, the formal release
                folder or ``entities_description.json`` is missing. In the last
                case the scraping script is first copied into the ``code``
                folder of the development repository.
        """
        print("Parsing data for Ifc4.3.0.1")

        dev_repository_found = IFC4x3_DEV_LOCATION.is_dir()
        if not dev_repository_found:
            message = (
                f'Specs development repository for Ifc4.3.0.1 expected to be in folder "{IFC4x3_DEV_LOCATION.resolve()}\\"\n'
                "For doc extraction please either setup docs as described above \n"
                "or change IFC4x3_DEV_LOCATION in the script accordingly.\n"
                "You can download docs from the repository: \n"
                "https://github.com/buildingSMART/IFC4.3.x-development"
            )
            raise Exception(message)

        formal_release_found = IFC4x3_HTML_LOCATION.is_dir()
        if not formal_release_found:
            message = (
                f'Formal release for Ifc4.3.0.1 expected to be in folder "{IFC4x3_HTML_LOCATION.resolve()}\\"\n'
                "For doc extraction please either setup docs as described above \n"
                "or change IFC4x3_HTML_LOCATION in the script accordingly.\n"
                "You can download docs from the repository: \n"
                "https://github.com/buildingsmart/ifc4.3-html"
            )
            raise Exception(message)

        scrape_script_name = "ifc4x3dev_scrape_data_for_docs.py"
        dev_code_path = IFC4x3_DEV_LOCATION / "code"
        description_json_path = dev_code_path / "entities_description.json"

        # everything is in place - run the actual extraction
        if description_json_path.is_file():
            self.extract_ifc4x3_entities()
            self.extract_ifc4x3_property_sets()
            return

        # descriptions are not generated yet: hand over the scraping script
        # and explain how to produce the missing json
        shutil.copy(BASE_MODULE_PATH / scrape_script_name, dev_code_path / scrape_script_name)
        message = (
            f'The entities description data expected to be located in \n"{description_json_path.resolve()}.\n'
            f"To generate it `ifc4x3dev_scrape_data_for_docs.py` will be copied from current folder to \n{dev_code_path}\n"
            "and you'll need to run in from `/code` folder.\nThis script will use development `server.py` "
            "module to extract entities descriptions.\n\n"
            "Before running it make sure you run `create_resources.sh` from `/code` folder first.\n"
            "You'll need to complete at least 3 commands from `create_resources.sh`:\n"
            "   py extract_concepts_from_xmi.py ../schemas/IFC.xml\n"
            "   py to_pset.py ../schemas/IFC.xml psd\n"
            "   py parse_xmi.py ../schemas/IFC.xml"
        )
        raise Exception(message)

    def extract_ifc4x3_entities(self):
        """Export IFC4x3 entities and types docs to ``schema/ifc4x3_entities.json`` / ``schema/ifc4x3_types.json``.

        Iterates over the declarations of the ``IFC4X3_ADD2`` ifcopenshell schema
        and pairs them with the descriptions loaded from
        ``code/entities_description.json`` of the development repository.
        Declarations, attributes and predefined types that are missing from the
        loaded descriptions get an empty description and a warning is printed.
        """
        with open(IFC4x3_DEV_LOCATION / "code/entities_description.json", "r") as fi:
            entities_description = json.load(fi)

        entities_dict = dict()
        types_dict = dict()
        schema = ifcopenshell.ifcopenshell_wrapper.schema_by_name("IFC4X3_ADD2")

        for entity in schema.declarations():
            entity_name = entity.name()

            entity_data = dict()
            entity_data["spec_url"] = IFC4x3_SPEC_URL_TEMPLATE % entity_name

            parsed_entity_data = entities_description.get(entity_name)
            if parsed_entity_data is None:
                print(
                    f"WARNING. Entity {entity_name} is not present in data parsed from DEV DOCUMENTATION "
                    "even though it's present in ifcopenshell schema. It's description will be left empty."
                )
                description = ""
                # keep going with empty data, so attributes of an undocumented
                # entity are reported below instead of failing with KeyError
                parsed_entity_data = dict()
            else:
                description = self.clean_highlighted_words(parsed_entity_data["description"])
            entity_data["description"] = description

            # types = type_declaration + enumeration_type + select_type
            if not isinstance(entity, ifcopenshell.ifcopenshell_wrapper.entity):
                types_dict[entity_name] = entity_data
                continue

            # entities processing
            # assign attributes / predef types data
            parsed_attributes_data = parsed_entity_data.get("attributes", {})
            parsed_predefined_types_data = parsed_entity_data.get("predefined_types", {})
            attributes_data = dict()
            predefined_types = dict()

            # iterate over forward and inverse entity attributes
            # TODO: more eloquent way to get inverse attributes of the declaration?
            for a in list(entity.attributes()) + get_inverse_attributes(entity):
                attr_name = a.name()
                # predefined types
                if attr_name == "PredefinedType":
                    for v in ifcopenshell.util.attribute.get_enum_items(a):
                        if v not in parsed_predefined_types_data:
                            print(
                                f"WARNING. Predefined type {v} (of entity {entity_name}) is not present in data parsed from DEV DOCUMENTATION "
                                "even though it's present in ifcopenshell schema. It's description will be left empty."
                            )
                            description = ""
                        else:
                            description = self.clean_description(parsed_predefined_types_data[v])
                        predefined_types[v] = description
                    continue

                # attributes
                if attr_name not in parsed_attributes_data:
                    print(
                        f"WARNING. Attribute {attr_name} (of entity {entity_name}) is not present in data parsed from DEV DOCUMENTATION "
                        "even though it's present in ifcopenshell schema. It's description will be left empty."
                    )
                    description = ""
                else:
                    description = self.clean_description(parsed_attributes_data[attr_name])
                attributes_data[attr_name] = description

            if attributes_data:
                entity_data["attributes"] = attributes_data
            if predefined_types:
                entity_data["predefined_types"] = predefined_types

            entities_dict[entity_name] = entity_data

        # export entities data
        with open(BASE_MODULE_PATH / "schema/ifc4x3_entities.json", "w", encoding="utf-8") as fo:
            print(f"{len(entities_dict)} entities parsed")
            json.dump(entities_dict, fo, sort_keys=True, indent=4)

        # export types data
        with open(BASE_MODULE_PATH / "schema/ifc4x3_types.json", "w", encoding="utf-8") as fo:
            print(f"{len(types_dict)} ifc types parsed")
            json.dump(types_dict, fo, sort_keys=True, indent=4)

    def extract_ifc4x3_property_sets(self):
        """Export IFC4x3 property / quantity sets docs to ``schema/ifc4x3_properties.json``.

        ``annex-a-psd.zip`` of the formal release is extracted to the temporary
        folder ``temp/annex-a-psd`` and every ``*.xml`` file in it is parsed as
        one set. The temporary folder is removed afterwards, also when the
        extraction or parsing fails.
        """
        pset_data_zip = IFC4x3_HTML_LOCATION / "IFC/RELEASE/IFC4x3/HTML/annex-a-psd.zip"
        pset_data_location = BASE_MODULE_PATH / "temp/annex-a-psd"

        property_sets_dict = dict()

        try:
            with zipfile.ZipFile(pset_data_zip, "r") as fi_zip:
                fi_zip.extractall(pset_data_location)

            for pset_path in glob.iglob(f"{pset_data_location}/*.xml"):
                pset_path = Path(pset_path)
                pset_name = pset_path.stem

                # pset / qset
                is_pset = pset_name.split("_")[0] == "Pset"

                pset_data = dict()
                pset_data["spec_url"] = IFC4x3_SPEC_URL_TEMPLATE % pset_name

                with open(pset_path, "r", encoding="utf-8") as fi:
                    root_xml = etree.fromstring(fi.read())

                # it could be just `<Definition/>`, same as for properties below
                description = root_xml.find("Definition").text or ""
                pset_data["description"] = self.clean_description(description)

                # parsing pset/qset properties data
                prop_data = dict()
                search_tag = "PropertyDef" if is_pset else "QtoDef"
                for prop in root_xml.find(search_tag + "s").findall(search_tag):
                    prop_name = prop.find("Name").text
                    # it could be just `<Definition/>`
                    prop_description = prop.find("Definition").text or ""
                    prop_data[prop_name] = {"description": self.clean_description(prop_description)}

                pset_data["properties"] = prop_data
                property_sets_dict[pset_name] = pset_data
        finally:
            # don't leave extracted files behind, even if something above failed
            if pset_data_location.is_dir():
                shutil.rmtree(pset_data_location)

        # export property sets data
        with open(BASE_MODULE_PATH / "schema/ifc4x3_properties.json", "w", encoding="utf-8") as fo:
            print(f"{len(property_sets_dict)} property sets parsed")
            json.dump(property_sets_dict, fo, sort_keys=True, indent=4)


def run_doc_api_examples():
    """Print example results of the docs lookup functions for IFC2X3, IFC4 and IFC4X3."""
    schema_versions = ("IFC2X3", "IFC4", "IFC4X3")

    print("Entities (with parent entities attributes included):")
    for schema_version in schema_versions:
        print(get_entity_doc(schema_version, "IfcWindow"))

    print("Entity attributes (with parent entities attributes included):")
    for schema_version in schema_versions:
        print(get_attribute_doc(schema_version, "IfcWindow", "OwnerHistory"))

    print("Entity predefined types:")
    for schema_version in schema_versions:
        print(get_predefined_type_doc(schema_version, "IfcControllerType", "FLOATING"))

    print("Propety sets:")
    for schema_version in schema_versions:
        print(get_property_set_doc(schema_version, "Pset_ZoneCommon"))

    # the property and the type looked up differ per schema version
    print("Propety sets attributes:")
    example_properties = ("Category", "NetPlannedArea", "NetPlannedArea")
    for schema_version, property_name in zip(schema_versions, example_properties):
        print(get_property_doc(schema_version, "Pset_ZoneCommon", property_name))

    print("Types:")
    example_types = ("IfcIsothermalMoistureCapacityMeasure", "IfcDuration", "IfcDuration")
    for schema_version, type_name in zip(schema_versions, example_types):
        print(get_type_doc(schema_version, type_name))


if __name__ == "__main__":
    # Run the extractors for IFC2X3, IFC4 and IFC4X3, in that order.
    doc_extractor = DocExtractor()
    for extract_schema_docs in (
        doc_extractor.extract_ifc2x3,
        doc_extractor.extract_ifc4,
        doc_extractor.extract_ifc4x3,
    ):
        extract_schema_docs()

    # Uncomment to print example lookups from the docs API.
    # run_doc_api_examples()