# Source code for mintlemon.sentence_splitter.sentence_splitter

import re
from typing import List
from pathlib import Path

# Location of the Turkish non-breaking prefix list: the file lives in the
# "data" directory two levels above this module's file. Kept as a plain
# string because it is handed straight to open().
PATH = str(
    Path(__file__).parent.parent.joinpath("data", "TR_non_breaking_prefixes.txt")
)

class SentenceSplitter:
    """
    Split text into sentences while respecting Turkish non-breaking prefixes.

    A period that directly follows one of the `Turkish non-breaking prefixes
    <https://github.com/tnltk/tnltk/blob/main/resources/TR_non_breaking_prefixes.txt>`_
    (for example an abbreviation) is not treated as the end of a sentence.

    Attributes
    ----------
    non_breaking_prefixes_tr : List[str]
        The non-blank lines of the prefix file at ``PATH``, stripped of
        surrounding whitespace.
    prefix_pattern : str
        Regular expression source matching any prefix (escaped literally)
        preceded by start-of-text or whitespace and followed by a period.
        Kept for callers that read it; ``split_sentences`` does not use it.

    Methods
    -------
    split_sentences(text: str) : List[str]
        Split the given text into sentences by considering Turkish
        non-breaking prefixes.
    """

    # A token is a maximal run of non-whitespace characters.
    _TOKEN_RE = re.compile(r"\S+")
    # Characters that may end a sentence when followed by whitespace.
    _TERMINATORS = ".!?"

    def __init__(self) -> None:
        """Load the non-breaking prefixes from the file at ``PATH``."""
        with open(PATH, "r", encoding="utf-8") as file:
            raw_lines = file.read().splitlines()

        # Blank lines are dropped: an empty alternative in the regex below
        # would match a bare " ." and is never a meaningful prefix.
        self.non_breaking_prefixes_tr = [
            line.strip() for line in raw_lines if line.strip()
        ]
        # Prefixes are literal text, so escape regex metacharacters
        # (e.g. a "." inside a prefix must not mean "any character").
        self.prefix_pattern = (
            r"(?:^|\s)("
            + "|".join(re.escape(prefix) for prefix in self.non_breaking_prefixes_tr)
            + r")\."
        )
        # Set for O(1) membership tests while splitting.
        self._prefix_set = frozenset(self.non_breaking_prefixes_tr)

    def split_sentences(self, text: str) -> List[str]:
        """
        Split the given text into sentences by considering Turkish
        non-breaking prefixes.

        A sentence boundary is a single whitespace character that directly
        follows ``.``, ``!`` or ``?``, unless the token before it is exactly a
        non-breaking prefix followed by a period. The boundary whitespace is
        removed; every other character of ``text`` is preserved unchanged.

        Parameters
        ----------
        text : str
            The input text to be split into sentences.

        Returns
        -------
        sentences : list
            A list of sentences. An empty input yields ``[""]``.

        Examples
        --------
        >>> from mintlemon import SentenceSplitter
        >>> splitter = SentenceSplitter()
        >>> text = "Bu cümle bir örnektir. Bu cümle de bir örnektir!"
        >>> splitter.split_sentences(text)
        ['Bu cümle bir örnektir.', 'Bu cümle de bir örnektir!']
        """
        sentences: List[str] = []
        text_length = len(text)
        sentence_start = 0

        for token in self._TOKEN_RE.finditer(text):
            token_end = token.end()
            if token_end == text_length:
                # Last token: no whitespace follows, so nothing to split on.
                break

            word = token.group()
            last_char = word[-1]
            if last_char not in self._TERMINATORS:
                continue
            # "Dr." style tokens: the period belongs to the abbreviation.
            if last_char == "." and word[:-1] in self._prefix_set:
                continue

            sentences.append(text[sentence_start:token_end])
            # Skip the single whitespace character that acted as separator.
            sentence_start = token_end + 1

        sentences.append(text[sentence_start:])
        return sentences