package sec.bdc.tm.hte.eu.preprocessing.bnlp.normalize;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;

/* loaded from: classes49.dex */
/**
 * Text normalizer that strips leading whitespace, folds a fixed set of typographic
 * quote, dash and angle-bracket characters to ASCII, and collapses whitespace into
 * single ASCII spaces.
 *
 * <p>Each call to {@link #normalize(String)} runs on a fresh internal worker object, so
 * no per-call state is kept on this instance.
 */
public class StandardTextNormalizer implements TextNormalizer {
    /**
     * Regex character-class body (without the surrounding brackets) listing every
     * whitespace character handled here except the plain ASCII space (U+0020).
     */
    private static final String WHITESPACES = "\\t\\n\\r\\f\\u000B\\u0085\\u00A0\\u1680\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200A\\u2028\\u2029\\u202F\\u205F\\u3000";

    /**
     * Matches either a run of two or more whitespace characters (ASCII space included)
     * or a single whitespace character that is not the ASCII space. A lone ASCII space
     * is therefore never matched.
     */
    private static final Pattern WHITESPACES_PATTERN =
            Pattern.compile("[" + WHITESPACES + "\\u0020]{2,}|[" + WHITESPACES + "]");

    /** Matches the run of whitespace (ASCII space included) at the very start of the text. */
    private static final Pattern WHITESPACES_HEAD_TRIMMING_PATTERN =
            Pattern.compile("^[" + WHITESPACES + " ]+");

    /** Ordered table of character substitutions; every source character maps to one ASCII character. */
    private static final List<CharSubstitution> ONE_TO_ONE_SUBSTITUTION_DATA = Arrays.asList(
            new CharSubstitution("`´ʹʻʼʾʿˋˊ˴‘’‚‛", '\''),
            new CharSubstitution("ʺ˝ˮ˵˶“”„‟", '\"'),
            new CharSubstitution("‒–—―", '-'),
            new CharSubstitution("⟨", '<'),
            new CharSubstitution("⟩", '>'));

    /**
     * One substitution rule: a set of source characters that are all replaced by the
     * same target character.
     */
    public static class CharSubstitution {
        private final char[] from;
        private final char to;

        /**
         * @param str every character of this string is a source character of the rule
         * @param c   the character each source character is replaced with
         */
        public CharSubstitution(String str, char c) {
            this.to = c;
            this.from = str.toCharArray();
        }

        /** @return the source characters of this rule (the internal array, not a copy) */
        public char[] getFrom() {
            return from;
        }

        /** @return the replacement character of this rule */
        public char getTo() {
            return to;
        }
    }

    /**
     * Single-use worker holding the text being rewritten and the position translator
     * that is filled in while the text is rewritten.
     */
    private static class StatefulTextNormalizer implements TextNormalizer {
        private String normalizedText;
        private PositionTranslator positionTranslator;

        private StatefulTextNormalizer() {
        }

        /**
         * Applies every rule of the substitution table, in table order, to the current
         * text. Replacements are one character for one character, so the text length
         * does not change and nothing is reported to the position translator.
         */
        private void normalizeOneToOne() {
            String text = this.normalizedText;
            for (CharSubstitution rule : ONE_TO_ONE_SUBSTITUTION_DATA) {
                final char replacement = rule.getTo();
                for (char source : rule.getFrom()) {
                    text = text.replace(source, replacement);
                }
            }
            this.normalizedText = text;
        }

        /**
         * Replaces every match of the whitespace pattern with a single ASCII space.
         * For matches longer than one character, the first and last index of the match
         * in the current text, together with the index of the replacement space in the
         * rewritten text, are passed to the position translator.
         */
        private void normalizeWhitespaces() {
            final String input = this.normalizedText;
            final Matcher whitespace = WHITESPACES_PATTERN.matcher(input);
            // StringBuffer is the appendReplacement target available on every Java level.
            final StringBuffer rewritten = new StringBuffer(input.length());
            while (whitespace.find()) {
                whitespace.appendReplacement(rewritten, StringUtils.SPACE);
                final int firstIndex = whitespace.start();
                final int lastIndex = whitespace.end() - 1;
                if (firstIndex == lastIndex) {
                    // Single character swapped for a single space: length is unchanged.
                    continue;
                }
                // The space just appended is the last character of the buffer.
                final int spaceIndex = rewritten.length() - 1;
                this.positionTranslator.registerOffsetChange(firstIndex, lastIndex, spaceIndex, spaceIndex);
            }
            whitespace.appendTail(rewritten);
            this.normalizedText = rewritten.toString();
        }

        /**
         * Removes the leading whitespace run, if there is one, and hands the number of
         * removed characters to the position translator as its global offset.
         */
        private void trimHead() {
            final Matcher head = WHITESPACES_HEAD_TRIMMING_PATTERN.matcher(this.normalizedText);
            if (!head.find()) {
                return;
            }
            final int removed = head.end();
            this.normalizedText = this.normalizedText.substring(removed);
            this.positionTranslator.setGlobalOffset(removed);
        }

        /**
         * Runs the three normalization steps in order: head trimming, one-to-one
         * character substitution, whitespace collapsing.
         *
         * @param str the raw text to normalize
         * @return a result built from the raw text, the normalized text and the
         *         position translator populated during normalization
         */
        @Override
        public NormalizedTextImpl normalize(String str) {
            this.positionTranslator = new PositionTranslator();
            this.normalizedText = str;

            trimHead();
            normalizeOneToOne();
            normalizeWhitespaces();

            return new NormalizedTextImpl(str, this.normalizedText, this.positionTranslator);
        }
    }

    /**
     * Normalizes the given text using a newly created worker.
     *
     * @param str the raw text to normalize
     * @return the normalization result produced by the worker
     */
    @Override
    public NormalizedText normalize(String str) {
        final StatefulTextNormalizer worker = new StatefulTextNormalizer();
        return worker.normalize(str);
    }
}
