#! pip install tokenizers
"""Train a BPE tokenizer on Dockerfile contents stored in MySQL.

Reads the ``content_valid`` column from each whitelisted table, flattens
each row's text onto a single line, trains a 50k-vocabulary BPE tokenizer,
and saves the trained model files (vocab.json / merges.txt) into the
``tokenizer`` directory.
"""

import os

import pymysql
from tokenizers import Tokenizer, normalizers, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.normalizers import NFD, StripAccents
from tokenizers.pre_tokenizers import Digits, Whitespace
from tokenizers.processors import BertProcessing  # used by the commented-out post-processor below
from tokenizers.trainers import BpeTrainer

# Tables whose rows may be used as training text. SQL identifiers cannot be
# bound as query parameters, so table names are validated against this
# whitelist before being interpolated into the query (prevents injection).
table_names = ['dockerfile']

# NOTE(review): in the original source every special token (and unk/pad) was
# an empty string -- almost certainly angle-bracket tokens stripped by markup
# processing. Restored to the standard RoBERTa set; <pad> at index 1 matches
# the commented-out enable_padding(pad_id=1, ...) call below. Confirm against
# the downstream model configuration.
SPECIAL_TOKENS = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]


def getSQLData(table_name):
    """Return all rows of the ``content_valid`` column of *table_name*.

    The connection and cursor are always closed (the original leaked both).

    Raises:
        ValueError: if *table_name* is not in the ``table_names`` whitelist.
    """
    if table_name not in table_names:
        raise ValueError("unknown table: %r" % table_name)
    con = pymysql.connect(host='localhost', user='root', passwd='1234', db='xxx')
    try:
        with con.cursor() as cur:
            cur.execute("SELECT content_valid from %s;" % table_name)
            return cur.fetchall()
    finally:
        con.close()


def _load_corpus():
    """Collect training text: one whitespace-flattened string per DB row."""
    text = []
    for table in table_names:
        for row in getSQLData(table):
            text.append(row[0].replace('\n', ' ').strip())
    return text


def _build_tokenizer():
    """Create an untrained BPE tokenizer with normalization/pre-tokenization."""
    tokenizer = Tokenizer(BPE(unk_token="<unk>"))
    # Unicode-normalize (NFD), then strip accents.
    tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
        Whitespace(),
        Digits(individual_digits=False),  # keep digit runs as single tokens
    ])
    return tokenizer


def main():
    """Load the corpus, train the tokenizer, and save the model files."""
    text = _load_corpus()
    tokenizer = _build_tokenizer()
    trainer = BpeTrainer(
        special_tokens=SPECIAL_TOKENS,
        vocab_size=50000,
        show_progress=True,
        min_frequency=2,
    )
    tokenizer.train_from_iterator(text, trainer)

    # tokenizer.post_processor = BertProcessing(
    #     ("</s>", tokenizer.token_to_id("</s>")),
    #     ("<s>", tokenizer.token_to_id("<s>")),
    # )
    # tokenizer.enable_padding(pad_id=1, pad_token="<pad>")

    # The original called tokenizer.save_model(), which does not exist on
    # Tokenizer and raised AttributeError. The trained vocab/merges are
    # saved via the underlying model; the target directory must exist.
    os.makedirs("tokenizer", exist_ok=True)
    tokenizer.model.save("tokenizer")

    print(tokenizer.get_vocab_size())


if __name__ == "__main__":
    main()