# kglids/kg_governor/knowledge_graph_construction/src/utils/utils.py
import re
from camelsplit import camelsplit
import pandas as pd


class Label:
    """A piece of text paired with a language tag.

    Both ``repr()`` and ``str()`` render the label as ``"<text>"@<lan>``.
    """

    def __init__(self, text: str, lan: str):
        self.text, self.lan = text, lan

    def get_text(self) -> str:
        """Return the label text."""
        return self.text

    def get_lan(self) -> str:
        """Return the language tag."""
        return self.lan

    def __repr__(self):
        # Quote the text, then append the language tag after an '@'.
        return "".join(('"', self.text, '"', "@", self.lan))

    def __str__(self):
        rendered = self.__repr__()
        return str(rendered)


# TODO: [Implement] Serialize as Turtle-star (needs RDFlib-star see: https://github.com/RDFLib/rdflib/discussions/1554)
class RDFResource:
    """A single RDF term (IRI, blank node or literal) with a textual rendering.

    Args:
        content: The value of the term. Together with ``namespace`` it forms
            an IRI; otherwise it is rendered as a blank-node id or a literal.
        namespace: Optional prefix; when truthy the term is rendered as
            ``<namespace + content>``.
        isBlank: When true (and no namespace is given) the term is rendered
            as the blank node ``_:content``.
    """

    def __init__(self, content, namespace=None, isBlank=False):
        self.content = content
        self.isBlank = isBlank
        self.namespace = namespace

    def __repr__(self):
        """Return the textual form of the term; always a ``str``.

        Rendering rules, in priority order:
          1. namespace set        -> ``<namespace + content>``
          2. blank node           -> ``_:content``
          3. ``str`` content      -> ``"content"``
          4. null content         -> ``"NaN"^^xsd:double``
          5. ``float`` content    -> the value rounded to 3 decimals
          6. anything else        -> ``str(content)`` (this covers ``Label``
             objects, which render themselves as ``"text"@lan``)
        """
        if self.namespace:
            return "<{}{}>".format(self.namespace, self.content)

        if self.isBlank:
            return '_:{}'.format(self.content)
        if isinstance(self.content, str):
            return '"{}"'.format(self.content)
        if pd.isnull(self.content):
            return '"NaN"^^xsd:double'
        if isinstance(self.content, float):
            # __repr__ must return a str: returning the bare float made
            # repr(resource) raise TypeError.
            return str(round(self.content, 3))

        # Same reason: never hand back the raw (possibly non-string) content.
        return str(self.content)

    def __str__(self):
        return self.__repr__()


class Triplet:
    """A subject / predicate / object statement.

    The subject or the object may itself be a ``Triplet``; such a nested
    statement is rendered wrapped in ``<< ... >>``. Only the outermost
    statement is terminated with a ``.``.
    """
    # TODO: [Refactor] Rename to RDFTriple

    def __init__(self, rdf_subject, rdf_predicate, rdf_object):
        self.rdf_subject = rdf_subject
        self.rdf_predicate = rdf_predicate
        self.rdf_object = rdf_object

    def __repr__(self):
        return self.__repr_helper(True)

    def __repr_helper(self, isRoot):
        """Render the statement; append the trailing '.' only when isRoot."""
        rendered_terms = []
        for term in (self.rdf_subject, self.rdf_object):
            if isinstance(term, Triplet):
                # Nested statements are quoted and never carry their own '.'.
                term = '<<{}>>'.format(term.__repr_helper(False))
            rendered_terms.append(term)
        subject_part, object_part = rendered_terms

        terminator = '.' if isRoot else ''
        return f'{subject_part} {self.rdf_predicate} {object_part}{terminator}'

    def __str__(self):
        rendered = self.__repr__()
        return str(rendered)

    def get_reversed_triple(self):
        """Return a new Triplet with subject and object swapped.

        When the subject is itself a Triplet, only that nested subject is
        reversed; the predicate and object of this statement are kept.
        """
        # TODO: [Refactor] a better name for this method?
        if not isinstance(self.rdf_subject, Triplet):
            return Triplet(self.rdf_object, self.rdf_predicate, self.rdf_subject)

        # The reverse of an RDF-star triple, is the reverse of the subject
        reversed_subject = self.rdf_subject.get_reversed_triple()
        return Triplet(reversed_subject, self.rdf_predicate, self.rdf_object)
    

def generate_label(col_name: str, lan: str) -> Label:
    """Build a lower-cased, space-separated Label from a column or file name.

    Steps: drop every literal ``.csv``, replace each run of non-alphanumeric
    characters with one space, split camelCase words, collapse repeated
    whitespace and lower-case the result.

    Args:
        col_name: The raw column (or file) name.
        lan: Language tag to attach to the resulting label.

    Returns:
        A ``Label`` holding the normalised text and ``lan``.
    """
    # TODO: [Implement] the way labels are generated is not 100% the best. It is not always best to split by camel case
    # Literal replacement: as a regex, '.csv' has a wildcard dot and would
    # also strip things like 'ycsv' out of 'mycsv_data.csv'.
    col_name = col_name.replace('.csv', '')
    col_name = re.sub(r'[^0-9a-zA-Z]+', ' ', col_name)
    text = " ".join(camelsplit(col_name.strip()))
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text.strip())
    return Label(text.lower(), lan)