@@ -1109,7 +1109,6 @@ struct FluxCLIPEmbedder : public Conditioner {
1109
1109
FluxCLIPEmbedder (ggml_backend_t backend,
1110
1110
std::map<std::string, enum ggml_type>& tensor_types,
1111
1111
int clip_skip = -1 ) {
1112
-
1113
1112
for (auto pair : tensor_types) {
1114
1113
if (pair.first .find (" text_encoders.clip_l" ) != std::string::npos) {
1115
1114
use_clip_l = true ;
@@ -1319,7 +1318,6 @@ struct FluxCLIPEmbedder : public Conditioner {
1319
1318
ggml_set_f32 (chunk_hidden_states, 0 .f );
1320
1319
}
1321
1320
1322
-
1323
1321
int64_t t1 = ggml_time_ms ();
1324
1322
LOG_DEBUG (" computing condition graph completed, taking %" PRId64 " ms" , t1 - t0);
1325
1323
if (force_zero_embeddings) {
@@ -1328,12 +1326,12 @@ struct FluxCLIPEmbedder : public Conditioner {
1328
1326
vec[i] = 0 ;
1329
1327
}
1330
1328
}
1331
-
1329
+
1332
1330
hidden_states_vec.insert (hidden_states_vec.end (),
1333
- (float *)chunk_hidden_states->data ,
1334
- ((float *)chunk_hidden_states->data ) + ggml_nelements (chunk_hidden_states));
1331
+ (float *)chunk_hidden_states->data ,
1332
+ ((float *)chunk_hidden_states->data ) + ggml_nelements (chunk_hidden_states));
1335
1333
}
1336
-
1334
+
1337
1335
if (hidden_states_vec.size () > 0 ) {
1338
1336
hidden_states = vector_to_ggml_tensor (work_ctx, hidden_states_vec);
1339
1337
hidden_states = ggml_reshape_2d (work_ctx,
@@ -1388,35 +1386,54 @@ struct PixArtCLIPEmbedder : public Conditioner {
1388
1386
bool use_mask = false ;
1389
1387
int mask_pad = 1 ;
1390
1388
1389
+ bool use_t5 = false ;
1390
+
1391
1391
PixArtCLIPEmbedder (ggml_backend_t backend,
1392
1392
std::map<std::string, enum ggml_type>& tensor_types,
1393
1393
int clip_skip = -1 ,
1394
1394
bool use_mask = false ,
1395
1395
int mask_pad = 1 )
1396
1396
: use_mask(use_mask), mask_pad(mask_pad) {
1397
- t5 = std::make_shared<T5Runner>(backend, tensor_types, " text_encoders.t5xxl.transformer" );
1397
+ for (auto pair : tensor_types) {
1398
+ if (pair.first .find (" text_encoders.t5xxl" ) != std::string::npos) {
1399
+ use_t5 = true ;
1400
+ }
1401
+ }
1402
+
1403
+ if (!use_t5) {
1404
+ LOG_WARN (" IMPORTANT NOTICE: No text encoders provided, cannot process prompts!" );
1405
+ return ;
1406
+ } else {
1407
+ t5 = std::make_shared<T5Runner>(backend, tensor_types, " text_encoders.t5xxl.transformer" );
1408
+ }
1398
1409
}
1399
1410
1400
1411
void set_clip_skip (int clip_skip) {
1401
1412
}
1402
1413
1403
1414
void get_param_tensors (std::map<std::string, struct ggml_tensor *>& tensors) {
1404
- t5->get_param_tensors (tensors, " text_encoders.t5xxl.transformer" );
1415
+ if (use_t5) {
1416
+ t5->get_param_tensors (tensors, " text_encoders.t5xxl.transformer" );
1417
+ }
1405
1418
}
1406
1419
1407
1420
void alloc_params_buffer () {
1408
- t5->alloc_params_buffer ();
1421
+ if (use_t5) {
1422
+ t5->alloc_params_buffer ();
1423
+ }
1409
1424
}
1410
1425
1411
1426
void free_params_buffer () {
1412
- t5->free_params_buffer ();
1427
+ if (use_t5) {
1428
+ t5->free_params_buffer ();
1429
+ }
1413
1430
}
1414
1431
1415
1432
size_t get_params_buffer_size () {
1416
1433
size_t buffer_size = 0 ;
1417
-
1418
- buffer_size += t5->get_params_buffer_size ();
1419
-
1434
+ if (use_t5) {
1435
+ buffer_size += t5->get_params_buffer_size ();
1436
+ }
1420
1437
return buffer_size;
1421
1438
}
1422
1439
@@ -1442,17 +1459,18 @@ struct PixArtCLIPEmbedder : public Conditioner {
1442
1459
std::vector<int > t5_tokens;
1443
1460
std::vector<float > t5_weights;
1444
1461
std::vector<float > t5_mask;
1445
- for (const auto & item : parsed_attention) {
1446
- const std::string& curr_text = item.first ;
1447
- float curr_weight = item.second ;
1448
-
1449
- std::vector<int > curr_tokens = t5_tokenizer.Encode (curr_text, true );
1450
- t5_tokens.insert (t5_tokens.end (), curr_tokens.begin (), curr_tokens.end ());
1451
- t5_weights.insert (t5_weights.end (), curr_tokens.size (), curr_weight);
1452
- }
1462
+ if (use_t5) {
1463
+ for (const auto & item : parsed_attention) {
1464
+ const std::string& curr_text = item.first ;
1465
+ float curr_weight = item.second ;
1453
1466
1454
- t5_tokenizer.pad_tokens (t5_tokens, t5_weights, &t5_mask, max_length, padding);
1467
+ std::vector<int > curr_tokens = t5_tokenizer.Encode (curr_text, true );
1468
+ t5_tokens.insert (t5_tokens.end (), curr_tokens.begin (), curr_tokens.end ());
1469
+ t5_weights.insert (t5_weights.end (), curr_tokens.size (), curr_weight);
1470
+ }
1455
1471
1472
+ t5_tokenizer.pad_tokens (t5_tokens, t5_weights, &t5_mask, max_length, padding);
1473
+ }
1456
1474
return {t5_tokens, t5_weights, t5_mask};
1457
1475
}
1458
1476
@@ -1489,38 +1507,44 @@ struct PixArtCLIPEmbedder : public Conditioner {
1489
1507
std::vector<float > hidden_states_vec;
1490
1508
1491
1509
size_t chunk_count = t5_tokens.size () / chunk_len;
1492
-
1493
1510
for (int chunk_idx = 0 ; chunk_idx < chunk_count; chunk_idx++) {
1494
1511
// t5
1495
- std::vector<int > chunk_tokens (t5_tokens.begin () + chunk_idx * chunk_len,
1496
- t5_tokens.begin () + (chunk_idx + 1 ) * chunk_len);
1497
- std::vector<float > chunk_weights (t5_weights.begin () + chunk_idx * chunk_len,
1498
- t5_weights.begin () + (chunk_idx + 1 ) * chunk_len);
1499
- std::vector<float > chunk_mask (t5_attn_mask_vec.begin () + chunk_idx * chunk_len,
1500
- t5_attn_mask_vec.begin () + (chunk_idx + 1 ) * chunk_len);
1501
-
1502
- auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, chunk_tokens);
1503
- auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor (work_ctx, chunk_mask) : NULL ;
1504
-
1505
- t5->compute (n_threads,
1506
- input_ids,
1507
- t5_attn_mask_chunk,
1508
- &chunk_hidden_states,
1509
- work_ctx);
1510
- {
1511
- auto tensor = chunk_hidden_states;
1512
- float original_mean = ggml_tensor_mean (tensor);
1513
- for (int i2 = 0 ; i2 < tensor->ne [2 ]; i2++) {
1514
- for (int i1 = 0 ; i1 < tensor->ne [1 ]; i1++) {
1515
- for (int i0 = 0 ; i0 < tensor->ne [0 ]; i0++) {
1516
- float value = ggml_tensor_get_f32 (tensor, i0, i1, i2);
1517
- value *= chunk_weights[i1];
1518
- ggml_tensor_set_f32 (tensor, value, i0, i1, i2);
1512
+
1513
+ if (use_t5) {
1514
+ std::vector<int > chunk_tokens (t5_tokens.begin () + chunk_idx * chunk_len,
1515
+ t5_tokens.begin () + (chunk_idx + 1 ) * chunk_len);
1516
+ std::vector<float > chunk_weights (t5_weights.begin () + chunk_idx * chunk_len,
1517
+ t5_weights.begin () + (chunk_idx + 1 ) * chunk_len);
1518
+ std::vector<float > chunk_mask (t5_attn_mask_vec.begin () + chunk_idx * chunk_len,
1519
+ t5_attn_mask_vec.begin () + (chunk_idx + 1 ) * chunk_len);
1520
+
1521
+ auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, chunk_tokens);
1522
+ auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor (work_ctx, chunk_mask) : NULL ;
1523
+ t5->compute (n_threads,
1524
+ input_ids,
1525
+ t5_attn_mask_chunk,
1526
+ &chunk_hidden_states,
1527
+ work_ctx);
1528
+ {
1529
+ auto tensor = chunk_hidden_states;
1530
+ float original_mean = ggml_tensor_mean (tensor);
1531
+ for (int i2 = 0 ; i2 < tensor->ne [2 ]; i2++) {
1532
+ for (int i1 = 0 ; i1 < tensor->ne [1 ]; i1++) {
1533
+ for (int i0 = 0 ; i0 < tensor->ne [0 ]; i0++) {
1534
+ float value = ggml_tensor_get_f32 (tensor, i0, i1, i2);
1535
+ value *= chunk_weights[i1];
1536
+ ggml_tensor_set_f32 (tensor, value, i0, i1, i2);
1537
+ }
1519
1538
}
1520
1539
}
1540
+ float new_mean = ggml_tensor_mean (tensor);
1541
+ ggml_tensor_scale (tensor, (original_mean / new_mean));
1521
1542
}
1522
- float new_mean = ggml_tensor_mean (tensor);
1523
- ggml_tensor_scale (tensor, (original_mean / new_mean));
1543
+ } else {
1544
+ chunk_hidden_states = ggml_new_tensor_2d (work_ctx, GGML_TYPE_F32, 4096 , chunk_len);
1545
+ ggml_set_f32 (chunk_hidden_states, 0 .f );
1546
+ t5_attn_mask = ggml_new_tensor_1d (work_ctx, GGML_TYPE_F32, chunk_len);
1547
+ ggml_set_f32 (t5_attn_mask, -HUGE_VALF);
1524
1548
}
1525
1549
1526
1550
int64_t t1 = ggml_time_ms ();
@@ -1544,8 +1568,10 @@ struct PixArtCLIPEmbedder : public Conditioner {
1544
1568
chunk_hidden_states->ne [0 ],
1545
1569
ggml_nelements (hidden_states) / chunk_hidden_states->ne [0 ]);
1546
1570
} else {
1547
- hidden_states = ggml_new_tensor_2d (work_ctx, GGML_TYPE_F32, 4096 , 256 );
1571
+ hidden_states = ggml_new_tensor_2d (work_ctx, GGML_TYPE_F32, 4096 , chunk_len );
1548
1572
ggml_set_f32 (hidden_states, 0 .f );
1573
+ t5_attn_mask = ggml_new_tensor_1d (work_ctx, GGML_TYPE_F32, chunk_len);
1574
+ ggml_set_f32 (t5_attn_mask, -HUGE_VALF);
1549
1575
}
1550
1576
1551
1577
modify_mask_to_attend_padding (t5_attn_mask, ggml_nelements (t5_attn_mask), mask_pad);
0 commit comments