package com.genesys.roberta.tokenizer;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicReference;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.LongStream;
import lombok.NonNull;

/* loaded from: input_file:com/genesys/roberta/tokenizer/RobertaTokenizer.class */
/**
 * {@link Tokenizer} implementation that turns a sentence into an array of token ids.
 *
 * <p>The sentence is split into chunks with {@link #PATTERN}; every chunk is converted to its
 * UTF-8 bytes, each byte is mapped through {@code RobertaTokenizerResources.encodeByte}, the
 * resulting string is handed to {@link BytePairEncoder#encode}, and every piece returned by the
 * encoder is mapped to an id through {@code RobertaTokenizerResources.encodeWord}. The output is
 * always framed by {@link #CLS_TOKEN} at the start and {@link #SEP_TOKEN} at the end.
 *
 * <p>The {@code RobertaTokenizerResources} instance is created lazily from the factory on the
 * first call to {@link #tokenize(String)} and then reused for the lifetime of this object.
 */
public class RobertaTokenizer implements Tokenizer {
    /** Padding token id. Not referenced by this class; exposed for callers. */
    public static final long PAD_TOKEN = 1;
    /** Token id emitted as the first element of every tokenized sequence. */
    public static final long CLS_TOKEN = 0;
    /** Token id emitted as the last element of every tokenized sequence. */
    public static final long SEP_TOKEN = 2;
    /**
     * Token id passed as the second argument of {@code encodeWord}.
     * NOTE(review): presumably the fallback id for pieces missing from the vocabulary; confirm
     * against {@code RobertaTokenizerResources.encodeWord}.
     */
    public static final long UNK_TOKEN = 3;

    /**
     * Pre-tokenization pattern. Alternatives, in order: the English contraction suffixes
     * ({@code 's 't 're 've 'm 'll 'd}), a run of letters, a run of digits, a run of characters
     * that are neither whitespace, letters nor digits (each of these three optionally preceded
     * by a single space), whitespace not followed by a non-whitespace character, and finally
     * any remaining whitespace.
     */
    private static final Pattern PATTERN = Pattern.compile("'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+");

    private final RobertaTokenizerResourcesFactory robertaTokenizerFactory;
    /** Holds the lazily created resources; {@code null} inside the reference means "not created yet". */
    private final AtomicReference<RobertaTokenizerResources> robertaResourcesCache = new AtomicReference<>();
    private final BytePairEncoder bytePairEncoder;

    /**
     * Creates a tokenizer whose resources are obtained from the given factory on first use.
     *
     * @param robertaTokenizerResourcesFactory factory used to create the tokenizer resources
     * @throws NullPointerException if {@code robertaTokenizerResourcesFactory} is {@code null}
     */
    public RobertaTokenizer(@NonNull RobertaTokenizerResourcesFactory robertaTokenizerResourcesFactory) {
        if (robertaTokenizerResourcesFactory == null) {
            throw new NullPointerException("robertaTokenizerResourcesFactory is marked non-null but is null");
        }
        this.robertaTokenizerFactory = robertaTokenizerResourcesFactory;
        this.bytePairEncoder = new BytePairEncoder();
    }

    /**
     * Tokenizes a sentence into token ids.
     *
     * @param str the sentence to tokenize
     * @return the token ids, starting with {@link #CLS_TOKEN} and ending with {@link #SEP_TOKEN};
     *         an empty sentence yields exactly those two ids
     * @throws NullPointerException if {@code str} is {@code null}, or if the factory produced
     *         {@code null} resources
     */
    @Override // com.genesys.roberta.tokenizer.Tokenizer
    public long[] tokenize(@NonNull String str) {
        if (str == null) {
            throw new NullPointerException("sentence is marked non-null but is null");
        }
        return encode(str, getRobertaTokenizerResources());
    }

    /**
     * Returns the cached resources, creating them through the factory when none are cached yet.
     *
     * <p>The compare-and-set guarantees that only the first published instance is ever returned,
     * but when several threads race on the first call the factory's {@code create()} may be
     * invoked more than once; the extra instances are discarded.
     *
     * @return the cached resources (whatever the factory's {@code create()} first produced)
     */
    private RobertaTokenizerResources getRobertaTokenizerResources() {
        RobertaTokenizerResources cached = this.robertaResourcesCache.get();
        if (cached == null) {
            this.robertaResourcesCache.compareAndSet(null, this.robertaTokenizerFactory.create());
            // Re-read: another thread may have won the race and published its instance.
            cached = this.robertaResourcesCache.get();
        }
        return cached;
    }

    /**
     * Encodes {@code str} into token ids using the supplied resources.
     *
     * @param str the text to encode
     * @param robertaTokenizerResources the resources used for byte mapping, byte-pair encoding
     *        and id lookup
     * @return {@link #CLS_TOKEN}, followed by the ids of all encoded pieces in input order,
     *         followed by {@link #SEP_TOKEN}
     * @throws NullPointerException if either argument is {@code null}
     */
    private long[] encode(@NonNull String str, @NonNull RobertaTokenizerResources robertaTokenizerResources) {
        if (str == null) {
            throw new NullPointerException("text is marked non-null but is null");
        }
        if (robertaTokenizerResources == null) {
            throw new NullPointerException("robertaResources is marked non-null but is null");
        }

        // Step 1: split into chunks and re-spell every chunk byte by byte through the byte mapping.
        List<String> byteEncodedChunks = new ArrayList<>();
        Matcher matcher = PATTERN.matcher(str);
        while (matcher.find()) {
            StringBuilder encodedChunk = new StringBuilder();
            for (byte b : matcher.group().getBytes(StandardCharsets.UTF_8)) {
                encodedChunk.append(robertaTokenizerResources.encodeByte(b));
            }
            byteEncodedChunks.add(encodedChunk.toString());
        }

        // Step 2: byte-pair encode each chunk and look up the id of every resulting piece.
        LongStream pieceIds = byteEncodedChunks.stream()
                .map(chunk -> this.bytePairEncoder.encode(chunk, robertaTokenizerResources))
                .flatMapToLong(pieces -> pieces.stream()
                        .mapToLong(piece -> robertaTokenizerResources.encodeWord(piece, UNK_TOKEN).longValue()));

        // Step 3: frame the sequence with the start and end markers.
        return LongStream.concat(
                LongStream.concat(LongStream.of(CLS_TOKEN), pieceIds),
                LongStream.of(SEP_TOKEN)).toArray();
    }
}
