-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_tokenization.py
More file actions
116 lines (90 loc) · 3.63 KB
/
test_tokenization.py
File metadata and controls
116 lines (90 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python3
"""
v0.2: Simple fine-tuning test (data-processing issue fixed)
"""
import json
import torch
from transformers import AutoTokenizer
from pathlib import Path
import logging
# Logging configuration: set the root logger's level to INFO.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Sequence length every encoded sample is padded / truncated to.
_MAX_LENGTH = 512
# Placeholder answer span used only when the answer tokens cannot be located
# in the encoded input: it starts right after [CLS] and spans an arbitrary
# five further tokens (the values the original script always used).
_FALLBACK_START = 1
_FALLBACK_END = _FALLBACK_START + 5


def _locate_answer_tokens(token_ids, answer_ids, search_from=0):
    """Return the inclusive (start, end) indices of ``answer_ids`` in ``token_ids``.

    Scans ``token_ids`` left to right beginning at index ``search_from`` and
    returns the first window that equals ``answer_ids``. Returns ``None`` when
    ``answer_ids`` is empty or no window matches.
    """
    width = len(answer_ids)
    if width == 0:
        return None
    for start in range(search_from, len(token_ids) - width + 1):
        if token_ids[start:start + width] == answer_ids:
            return start, start + width - 1
    return None


def _tokenize_sample(tokenizer, sample_data):
    """Encode one QA sample into model-ready tensors with answer-span labels.

    Args:
        tokenizer: Tokenizer object; it is called on the (question, context)
            pair and its ``encode`` method is used on the answer text.
        sample_data: Mapping with ``'question'``, ``'context'`` and
            ``'answer'`` entries.

    Returns:
        Dict with ``input_ids``, ``token_type_ids`` and ``attention_mask``
        (batch dimension removed) plus scalar ``start_positions`` /
        ``end_positions`` tensors of dtype ``torch.long``.
    """
    encoding = tokenizer(
        sample_data['question'],
        sample_data['context'],
        truncation=True,
        padding='max_length',
        max_length=_MAX_LENGTH,
        return_tensors="pt",
    )

    # Drop only the batch dimension; a bare squeeze() would also collapse the
    # sequence dimension if it ever had length 1.
    input_ids = encoding['input_ids'].squeeze(0)
    if 'token_type_ids' in encoding:
        token_type_ids = encoding['token_type_ids'].squeeze(0)
    else:
        token_type_ids = torch.zeros_like(input_ids)

    # Search for the answer's token ids inside the encoded pair. The search
    # starts after the first separator token so a match inside the question
    # segment is not picked up.
    id_list = input_ids.tolist()
    sep_id = getattr(tokenizer, 'sep_token_id', None)
    context_offset = id_list.index(sep_id) + 1 if sep_id in id_list else 0
    answer_ids = tokenizer.encode(sample_data['answer'], add_special_tokens=False)
    span = _locate_answer_tokens(id_list, answer_ids, context_offset)

    if span is None:
        # The answer may have been truncated away, or may tokenize differently
        # in isolation than inside the context; fall back to the placeholder
        # span so the sample still has well-formed labels.
        print("⚠️ Answer tokens not found in encoded input. Using placeholder span")
        span = (_FALLBACK_START, _FALLBACK_END)
    start_pos, end_pos = span

    return {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': encoding['attention_mask'].squeeze(0),
        'start_positions': torch.tensor(start_pos, dtype=torch.long),
        'end_positions': torch.tensor(end_pos, dtype=torch.long),
    }


def test_tokenization(*, samples_path="data/processed/qa_dataset_v2/qa_samples_100.json",
                      model_name="cl-tohoku/bert-base-japanese-v3"):
    """Smoke-test the tokenization pipeline on the first QA sample.

    Loads the JSON sample file, tokenizes the first sample's question/context
    pair, prints shape and answer-position diagnostics, and builds the tensor
    dict for that sample.

    Args:
        samples_path: Path of the JSON file holding the QA samples.
        model_name: Name passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The dict produced for the first sample: ``input_ids``,
        ``token_type_ids``, ``attention_mask``, ``start_positions`` and
        ``end_positions`` tensors.

    Raises:
        IndexError: If the sample file contains no samples.
    """
    print("🧪 Testing Tokenization Process")
    print("=" * 40)

    # Load the data samples.
    with open(samples_path, 'r', encoding='utf-8') as f:
        samples = json.load(f)
    print(f"📊 Loaded {len(samples)} samples")

    # Fail fast with a clear message before loading the tokenizer.
    if not samples:
        raise IndexError(f"No samples found in {samples_path}")

    # Initialise the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Use the first sample for the test.
    sample = samples[0]
    question = sample['question']
    context = sample['context']
    answer = sample['answer']

    print("📝 Test Sample:")
    print(f" Question: {question}")
    print(f" Context: {context[:100]}...")
    print(f" Answer: {answer}")

    # Encode the question/context pair (batched tensors, for the shape report).
    encoding = tokenizer(
        question,
        context,
        truncation=True,
        padding='max_length',
        max_length=_MAX_LENGTH,
        return_tensors="pt",
    )
    print("\n🔤 Tokenization Results:")
    print(f" Input IDs shape: {encoding['input_ids'].shape}")
    print(f" Attention mask shape: {encoding['attention_mask'].shape}")

    # Character-level answer position inside the context.
    answer_start = context.find(answer)
    if answer_start == -1:
        print("⚠️ Answer not found in context. Using start=0")
        answer_start = 0
    answer_end = answer_start + len(answer) - 1

    # Token-level view of the answer and the context, for the report.
    answer_tokens = tokenizer.encode(answer, add_special_tokens=False)
    context_tokens = tokenizer.encode(context, add_special_tokens=False)
    print("\n📍 Position Information:")
    print(f" Character positions: start={answer_start}, end={answer_end}")
    print(f" Answer tokens: {answer_tokens[:5]}...")
    print(f" Context tokens: {len(context_tokens)} tokens")

    # Build the model-ready tensors for the sample.
    processed = _tokenize_sample(tokenizer, sample)
    print("\n✅ Processed Sample:")
    for key, value in processed.items():
        print(f" {key}: shape={value.shape}, dtype={value.dtype}")

    print("\n🎯 Tokenization Test Complete!")
    return processed
# Script entry point: run the tokenization smoke test when executed directly.
if __name__ == "__main__":
    test_tokenization()