Source code for catatom2osm.hgwnames

"""Parsing of highway names."""
import re

from fuzzywuzzy import fuzz, process

from catatom2osm import config

# Score that a fuzzy match must strictly exceed for match() to return the
# matched choice instead of the parsed name.
MATCH_THR = 60


def normalize(text):
    """
    Return text lower-cased, trimmed and without its parenthesized part.

    A falsy value (for example None) is handled as an empty string. The
    removed part goes from the first "(" (including the spaces right before
    it) up to the last ")" found on the same line.
    """
    lowered = (text or "").lower()
    trimmed = lowered.strip()
    return re.sub(r" *\(.*\)", "", trimmed)


def parse(name):
    """
    Transform the name of a street from Cadastre conventions to OSM ones.

    Only the text before the first ";" is kept. The first word is translated
    with ``config.highway_types`` (title-cased when it has no translation);
    the following words are title-cased, except the words listed in
    ``config.lowcase_words`` and the short articles attached with an
    apostrophe, which are lower-cased.

    Args:
        name (str): Street name to convert

    Returns:
        str: The converted name, or an empty string when the first word is
        listed in ``config.excluded_types``.
    """
    name = name.split(";")[0]  # Remove additional information
    name = re.sub(r"[,]+", ", ", name).strip()  # Avoids comma without trailing space
    result = []
    for i, word in enumerate(re.split(r"[ ]+", name)):
        nude_word = re.sub(r"^\(|\)$", "", word)  # Remove enclosing parenthesis
        if i == 0:
            if word in config.excluded_types:
                return ""
            new_word = config.highway_types.get(word, word.title())
        elif nude_word in config.lowcase_words:  # Articles
            new_word = word.lower()
        elif "'" in word[1:-1]:  # Articles with apostrophe
            # Split on the first and on the last apostrophe separately so that
            # no text is dropped when the word holds more than one apostrophe.
            # With a single apostrophe both splits give the same two parts.
            left, _, after_first = word.partition("'")
            before_last, _, right = word.rpartition("'")
            if left in ("C", "D", "L", "N", "S"):
                new_word = left.lower() + "'" + after_first.title()
            elif right in ("S", "N", "L", "LA", "LS"):
                new_word = before_last.title() + "'" + right.lower()
            else:
                new_word = word.title()
        else:
            new_word = word.title()
        new_word = new_word.replace("·L", "·l")  # Geminated letter L
        new_word = new_word.replace(".L", "·l")  # Geminated letter L
        result.append(new_word)
    return " ".join(result).strip()


def match(name, choices):
    """
    Fuzzy search best match for string name in iterable choices.

    If the result is not good enough returns the name parsed.

    Args:
        name (str): String to look for
        choices (list): Iterable with choices

    Returns:
        tuple: ``(choice, "OSM")`` with the original (not normalized) choice
        when its score is greater than MATCH_THR, otherwise
        ``(parsed_name, "CAT")``.
    """
    parsed_name = parse(name)
    if fuzz and parsed_name:
        # Materialize the choices: they are iterated to normalize them and
        # indexed afterwards, which a generator or a set would not allow.
        choices = list(choices)
        normalized = [normalize(c) for c in choices]
        try:
            matching = process.extractOne(
                normalize(parsed_name), normalized, scorer=fuzz.token_sort_ratio
            )
        except RuntimeError:
            # Best effort: fall back to the parsed name.
            # NOTE(review): presumably raised by fuzzywuzzy when there are no
            # choices to compare with - confirm.
            matching = None
        if matching and matching[1] > MATCH_THR:
            # Map the normalized winner back to the choice as it was given
            return choices[normalized.index(matching[0])], "OSM"
    return parsed_name, "CAT"


def dsmatch(name, dataset, fn):
    """
    Fuzzy search best matching object for string name in dataset.

    Args:
        name (str): String to look for
        dataset (list): List of objects to search for
        fn (function): Function to obtain a string from a element of the dataset

    Returns:
        First element with the maximum fuzzy ratio, or None when no element
        gets a ratio greater than zero. If name is empty, the first element
        whose normalized string equals the normalized name, or None.
    """
    target = normalize(name)  # Loop invariant, computed only once
    if not (fuzz and name):
        # Nothing to score against: only an exact normalized match is valid
        for e in dataset:
            if target == normalize(fn(e)):
                return e
        return None
    max_ratio = 0
    matching = None
    for e in dataset:
        ratio = fuzz.token_sort_ratio(target, normalize(fn(e)))
        if ratio > max_ratio:  # Strict comparison keeps the first best element
            max_ratio = ratio
            matching = e
    return matching