Source code for catatom2osm.hgwnames

"""Parsing of highway names."""
import re

from fuzzywuzzy import fuzz, process

from catatom2osm import config

# Score that a fuzzy candidate must exceed (strict ">") for match() to accept
# it instead of falling back to the parsed Cadastre name.
MATCH_THR = 60


def normalize(text):
    """
    Return a simplified form of text suitable for comparisons.

    The text is lowercased and stripped of surrounding whitespace, then
    everything from the first opening parenthesis (with any spaces right
    before it) up to the last closing parenthesis is removed.

    Args:
        text (str): String to simplify; any falsy value is treated as "".
    """
    if not text:
        cleaned = ""
    else:
        cleaned = text.lower().strip()
    return re.sub(r" *\(.*\)", "", cleaned)
def parse(name):
    """
    Transform the name of a street from Cadastre conventions to OSM ones.

    Args:
        name (str): Street name as found in the Cadastre data.

    Returns:
        str: The converted name, or "" when the first word of the name is
        listed in config.excluded_types.
    """
    # Only the text before the first semicolon is the name itself
    main_part = name.split(";")[0]
    # Collapse runs of commas into one comma followed by a space
    main_part = re.sub(r"[,]+", ", ", main_part).strip()
    converted = []
    for position, token in enumerate(re.split(r"[ ]+", main_part)):
        if position == 0:
            # The first word is the highway type
            if token in config.excluded_types:
                return ""
            piece = config.highway_types.get(token, token.title())
        elif re.sub(r"^\(|\)$", "", token) in config.lowcase_words:
            # Articles stay lowercase; the check ignores an enclosing parenthesis
            piece = token.lower()
        elif "'" in token[1:-1]:
            # Articles joined with an apostrophe: only the text before the
            # first apostrophe and after the last one is considered
            parts = token.split("'")
            head, tail = parts[0], parts[-1]
            if head in ("C", "D", "L", "N", "S"):
                piece = head.lower() + "'" + tail.title()
            elif tail in ("S", "N", "L", "LA", "LS"):
                piece = head.title() + "'" + tail.lower()
            else:
                piece = token.title()
        else:
            piece = token.title()
        # Geminated L: keep the second "l" lowercase and write the separator
        # as a middle dot even when the source used a plain period
        piece = piece.replace("·L", "·l").replace(".L", "·l")
        converted.append(piece)
    return " ".join(converted).strip()
def match(name, choices):
    """
    Fuzzy search best match for string name in iterable choices.

    If the result is not good enough returns the name parsed.

    Args:
        name (str): String to look for
        choices (iterable): Candidate strings. Lists and tuples are used as
            they are; any other iterable is copied into a list first.

    Returns:
        tuple: (choice, "OSM") with the original, un-normalized choice when
        the best score is greater than MATCH_THR, else (parsed_name, "CAT").
    """
    parsed_name = parse(name)
    if fuzz and parsed_name:
        # The winning candidate is recovered by position further down, so the
        # choices must be indexable and safe to iterate more than once.
        if not isinstance(choices, (list, tuple)):
            choices = list(choices)
        normalized = [normalize(c) for c in choices]
        try:
            matching = process.extractOne(
                normalize(parsed_name), normalized, scorer=fuzz.token_sort_ratio
            )
        except RuntimeError:
            # Best effort: if the matcher fails, fall back to the parsed name
            matching = None
        if matching and matching[1] > MATCH_THR:
            # matching[0] is an entry of normalized; map it back to the
            # original choice found at the same position
            return choices[normalized.index(matching[0])], "OSM"
    return parsed_name, "CAT"
def dsmatch(name, dataset, fn):
    """
    Fuzzy search best matching object for string name in dataset.

    Args:
        name (str): String to look for
        dataset (list): List of objects to search for
        fn (function): Function to obtain a string from a element of the dataset

    Returns:
        First element with the maximum fuzzy ratio, or None when no element
        scores above 0. When the fuzzy comparison is not used (for example,
        name is empty), the first element whose normalized string equals the
        normalized name, or None if there is none.
    """
    # Loop-invariant: normalize the searched name once, not once per element
    target = normalize(name)
    if fuzz and name:
        best_ratio = 0
        best = None
        for element in dataset:
            ratio = fuzz.token_sort_ratio(target, normalize(fn(element)))
            # Strict ">" keeps the first element among equal scores
            if ratio > best_ratio:
                best_ratio = ratio
                best = element
        return best
    # Fallback: exact comparison of the normalized strings, first hit wins
    for element in dataset:
        if target == normalize(fn(element)):
            return element
    return None