Talksmith/preprocessing.py at main · Jovinull/Talksmith · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from __future__ import annotations

import unicodedata
from functools import lru_cache
from typing import Iterable

import spacy

_NLP = None


def get_nlp():
    """
    Carrega spaCy UMA vez e reaproveita.
    Importante: não desabilitar tagger/morph, porque isso afeta a lematização.
    """
    global _NLP
    if _NLP is None:
        try:
            _NLP = spacy.load("pt_core_news_sm", disable=["parser", "ner"])
        except OSError as e:
            raise RuntimeError(
                "Modelo spaCy 'pt_core_news_sm' não encontrado. "
                "Instale com: python -m spacy download pt_core_news_sm"
            ) from e
    return _NLP


def remove_accents(text: str) -> str:
    nfkd = unicodedata.normalize("NFKD", text or "")
    return "".join(c for c in nfkd if not unicodedata.combining(c))


@lru_cache(maxsize=4096)
def normalize(text: str) -> tuple[str, ...]:
    """
    Normaliza um texto em tokens lematizados (tupla para cache).
    - lower
    - remove acentos
    - remove stopwords
    - mantém apenas palavras (is_alpha)
    """
    txt = remove_accents(text).lower()
    nlp = get_nlp()
    doc = nlp(txt)

    out: list[str] = []
    for tok in doc:
        if not tok.is_alpha:
            continue
        if tok.is_stop:
            continue
        lemma = (tok.lemma_ or "").strip()
        if lemma:
            out.append(lemma)
    return tuple(out)


def normalize_join(text: str) -> str:
    return " ".join(normalize(text))


def normalize_many(texts: Iterable[str], batch_size: int = 64) -> list[list[str]]:
    """
    Normaliza vários textos usando nlp.pipe (bem mais rápido no corpus).
    """
    nlp = get_nlp()
    cleaned = (remove_accents(t).lower() for t in texts)

    results: list[list[str]] = []
    for doc in nlp.pipe(cleaned, batch_size=batch_size):
        toks: list[str] = []
        for tok in doc:
            if tok.is_alpha and (not tok.is_stop):
                lemma = (tok.lemma_ or "").strip()
                if lemma:
                    toks.append(lemma)
        results.append(toks)

    return results