0%| | 1/13317 [00:01<5:25:10, 1.47s/it] {'loss': 0.9494, 'grad_norm': 2.646820306777954, 'learning_rate': 7.507507507507508e-09, 'epoch': 0.0} 0%| | 1/13317 [00:01<5:25:10, 1.47s/it] 0%| | 2/13317 [00:02<3:25:03, 1.08it/s] {'loss': 0.865, 'grad_norm': 2.3400673866271973, 'learning_rate': 1.5015015015015016e-08, 'epoch': 0.0} 0%| | 2/13317 [00:02<3:25:03, 1.08it/s] 0%| | 3/13317 [00:02<2:53:23, 1.28it/s] {'loss': 0.8431, 'grad_norm': 2.388704299926758, 'learning_rate': 2.2522522522522524e-08, 'epoch': 0.0} 0%| | 3/13317 [00:02<2:53:23, 1.28it/s] 0%| | 4/13317 [00:03<2:38:43, 1.40it/s] {'loss': 0.8816, 'grad_norm': 2.6599130630493164, 'learning_rate': 3.003003003003003e-08, 'epoch': 0.0} 0%| | 4/13317 [00:03<2:38:43, 1.40it/s] 0%| | 5/13317 [00:03<2:30:46, 1.47it/s] {'loss': 1.0193, 'grad_norm': 2.8670766353607178, 'learning_rate': 3.753753753753754e-08, 'epoch': 0.0} 0%| | 5/13317 [00:03<2:30:46, 1.47it/s] 0%| | 6/13317 [00:04<2:25:59, 1.52it/s] {'loss': 0.8743, 'grad_norm': 2.447897434234619, 'learning_rate': 4.504504504504505e-08, 'epoch': 0.0} 0%| | 6/13317 [00:04<2:25:59, 1.52it/s] 0%| | 7/13317 [00:05<2:22:48, 1.55it/s] {'loss': 1.0061, 'grad_norm': 2.9875383377075195, 'learning_rate': 5.255255255255256e-08, 'epoch': 0.0} 0%| | 7/13317 [00:05<2:22:48, 1.55it/s] 0%| | 8/13317 [00:05<2:20:38, 1.58it/s] {'loss': 0.974, 'grad_norm': 2.3990159034729004, 'learning_rate': 6.006006006006006e-08, 'epoch': 0.0} 0%| | 8/13317 [00:05<2:20:38, 1.58it/s] 0%| | 9/13317 [00:06<2:19:14, 1.59it/s] {'loss': 1.2645, 'grad_norm': 2.6647093296051025, 'learning_rate': 6.756756756756757e-08, 'epoch': 0.0} 0%| | 9/13317 [00:06<2:19:14, 1.59it/s] 0%| | 10/13317 [00:06<2:18:46, 1.60it/s] {'loss': 0.9874, 'grad_norm': 2.5233733654022217, 'learning_rate': 7.507507507507508e-08, 'epoch': 0.0} 0%| | 10/13317 [00:07<2:18:46, 1.60it/s] 0%| | 11/13317 [00:07<2:17:58, 1.61it/s] {'loss': 0.899, 'grad_norm': 2.0676119327545166, 'learning_rate': 8.258258258258259e-08, 'epoch': 0.0} 0%| | 11/13317 [00:07<2:17:58, 1.61it/s] 0%| | 12/13317 [00:08<2:17:30, 1.61it/s] {'loss': 0.9271, 'grad_norm': 2.888087034225464, 'learning_rate': 9.00900900900901e-08, 'epoch': 0.0} 0%| | 12/13317 [00:08<2:17:30, 1.61it/s] 0%| | 13/13317 [00:08<2:17:09, 1.62it/s] {'loss': 1.0218, 'grad_norm': 3.219238042831421, 'learning_rate': 9.75975975975976e-08, 'epoch': 0.0} 0%| | 13/13317 [00:08<2:17:09, 1.62it/s] 0%| | 14/13317 [00:09<2:16:52, 1.62it/s] {'loss': 1.1003, 'grad_norm': 3.288654327392578, 'learning_rate': 1.0510510510510513e-07, 'epoch': 0.0} 0%| | 14/13317 [00:09<2:16:52, 1.62it/s] 0%| | 15/13317 [00:10<2:16:57, 1.62it/s] {'loss': 1.1175, 'grad_norm': 3.4702346324920654, 'learning_rate': 1.1261261261261262e-07, 'epoch': 0.0} 0%| | 15/13317 [00:10<2:16:57, 1.62it/s] 0%| | 16/13317 [00:10<2:16:43, 1.62it/s] {'loss': 0.7981, 'grad_norm': 2.412877321243286, 'learning_rate': 1.2012012012012013e-07, 'epoch': 0.0} 0%| | 16/13317 [00:10<2:16:43, 1.62it/s] 0%| | 17/13317 [00:11<2:16:36, 1.62it/s] {'loss': 0.9397, 'grad_norm': 2.726624011993408, 'learning_rate': 1.2762762762762765e-07, 'epoch': 0.0} 0%| | 17/13317 [00:11<2:16:36, 1.62it/s] 0%| | 18/13317 [00:11<2:16:27, 1.62it/s] {'loss': 0.9479, 'grad_norm': 2.877187967300415, 'learning_rate': 1.3513513513513515e-07, 'epoch': 0.0} 0%| | 18/13317 [00:11<2:16:27, 1.62it/s] 0%| | 19/13317 [00:12<2:16:26, 1.62it/s] {'loss': 0.9864, 'grad_norm': 3.8969573974609375, 'learning_rate': 1.4264264264264264e-07, 'epoch': 0.0} 0%| | 19/13317 [00:12<2:16:26, 1.62it/s] 0%| | 20/13317 [00:13<2:16:41, 1.62it/s] {'loss': 0.9975, 'grad_norm': 3.142580032348633, 'learning_rate': 1.5015015015015016e-07, 'epoch': 0.0} 0%| | 20/13317 [00:13<2:16:41, 1.62it/s] 0%| | 21/13317 [00:13<2:16:48, 1.62it/s] {'loss': 0.8595, 'grad_norm': 2.11672043800354, 'learning_rate': 1.5765765765765766e-07, 'epoch': 0.0} 0%| | 21/13317 [00:13<2:16:48, 1.62it/s] 0%| | 22/13317 [00:14<2:16:35, 1.62it/s] {'loss': 0.9348, 'grad_norm': 2.5887250900268555, 'learning_rate': 1.6516516516516518e-07, 'epoch': 0.0} 0%| | 22/13317 [00:14<2:16:35, 1.62it/s] 0%| | 23/13317 [00:14<2:16:28, 1.62it/s] {'loss': 1.0693, 'grad_norm': 3.3582563400268555, 'learning_rate': 1.7267267267267267e-07, 'epoch': 0.01} 0%| | 23/13317 [00:15<2:16:28, 1.62it/s] 0%| | 24/13317 [00:15<2:16:23, 1.62it/s] {'loss': 0.9229, 'grad_norm': 2.886667490005493, 'learning_rate': 1.801801801801802e-07, 'epoch': 0.01} 0%| | 24/13317 [00:15<2:16:23, 1.62it/s] 0%| | 25/13317 [00:16<2:16:33, 1.62it/s] {'loss': 0.8707, 'grad_norm': 2.1457412242889404, 'learning_rate': 1.8768768768768772e-07, 'epoch': 0.01} 0%| | 25/13317 [00:16<2:16:33, 1.62it/s] 0%| | 26/13317 [00:16<2:16:22, 1.62it/s] {'loss': 0.9686, 'grad_norm': 2.731424331665039, 'learning_rate': 1.951951951951952e-07, 'epoch': 0.01} 0%| | 26/13317 [00:16<2:16:22, 1.62it/s] 0%| | 27/13317 [00:17<2:16:12, 1.63it/s] {'loss': 0.9873, 'grad_norm': 2.6451382637023926, 'learning_rate': 2.0270270270270273e-07, 'epoch': 0.01} 0%| | 27/13317 [00:17<2:16:12, 1.63it/s] 0%| | 28/13317 [00:18<2:16:08, 1.63it/s] {'loss': 0.9351, 'grad_norm': 2.7847933769226074, 'learning_rate': 2.1021021021021025e-07, 'epoch': 0.01} 0%| | 28/13317 [00:18<2:16:08, 1.63it/s] 0%| | 29/13317 [00:18<2:16:04, 1.63it/s] {'loss': 0.9784, 'grad_norm': 2.752998113632202, 'learning_rate': 2.1771771771771772e-07, 'epoch': 0.01} 0%| | 29/13317 [00:18<2:16:04, 1.63it/s] 0%| | 30/13317 [00:19<2:16:13, 1.63it/s] {'loss': 1.0221, 'grad_norm': 3.005768299102783, 'learning_rate': 2.2522522522522524e-07, 'epoch': 0.01} 0%| | 30/13317 [00:19<2:16:13, 1.63it/s] 0%| | 31/13317 [00:19<2:16:04, 1.63it/s] {'loss': 0.9297, 'grad_norm': 2.387977123260498, 'learning_rate': 2.3273273273273274e-07, 'epoch': 0.01} 0%| | 31/13317 [00:19<2:16:04, 1.63it/s] 0%| | 32/13317 [00:20<2:16:02, 1.63it/s] {'loss': 0.9542, 'grad_norm': 2.441345691680908, 'learning_rate': 2.4024024024024026e-07, 'epoch': 0.01} 0%| | 32/13317 [00:20<2:16:02, 1.63it/s] 0%| | 33/13317 [00:21<2:15:57, 1.63it/s] {'loss': 0.9051, 'grad_norm': 3.5449955463409424, 'learning_rate': 2.477477477477478e-07, 'epoch': 0.01} 0%| | 33/13317 [00:21<2:15:57, 1.63it/s] 0%| | 34/13317 [00:21<2:15:52, 1.63it/s] {'loss': 0.8718, 'grad_norm': 2.3236653804779053, 'learning_rate': 2.552552552552553e-07, 'epoch': 0.01} 0%| | 34/13317 [00:21<2:15:52, 1.63it/s] 0%| | 35/13317 [00:22<2:15:55, 1.63it/s] {'loss': 0.9744, 'grad_norm': 2.996504068374634, 'learning_rate': 2.6276276276276277e-07, 'epoch': 0.01} 0%| | 35/13317 [00:22<2:15:55, 1.63it/s] 0%| | 36/13317 [00:22<2:15:54, 1.63it/s] {'loss': 1.0229, 'grad_norm': 3.263207197189331, 'learning_rate': 2.702702702702703e-07, 'epoch': 0.01} 0%| | 36/13317 [00:23<2:15:54, 1.63it/s] 0%| | 37/13317 [00:23<2:15:52, 1.63it/s] {'loss': 0.8876, 'grad_norm': 2.4195640087127686, 'learning_rate': 2.7777777777777776e-07, 'epoch': 0.01} 0%| | 37/13317 [00:23<2:15:52, 1.63it/s] 0%| | 38/13317 [00:24<2:15:47, 1.63it/s] {'loss': 0.9029, 'grad_norm': 3.4358081817626953, 'learning_rate': 2.852852852852853e-07, 'epoch': 0.01} 0%| | 38/13317 [00:24<2:15:47, 1.63it/s] 0%| | 39/13317 [00:24<2:15:44, 1.63it/s] {'loss': 0.9481, 'grad_norm': 2.456697940826416, 'learning_rate': 2.927927927927928e-07, 'epoch': 0.01} 0%| | 39/13317 [00:24<2:15:44, 1.63it/s] 0%| | 40/13317 [00:25<2:15:51, 1.63it/s] {'loss': 0.8763, 'grad_norm': 2.5605924129486084, 'learning_rate': 3.003003003003003e-07, 'epoch': 0.01} 0%| | 40/13317 [00:25<2:15:51, 1.63it/s] 0%| | 41/13317 [00:25<2:15:44, 1.63it/s] {'loss': 0.8957, 'grad_norm': 2.5799906253814697, 'learning_rate': 3.0780780780780784e-07, 'epoch': 0.01} 0%| | 41/13317 [00:26<2:15:44, 1.63it/s] 0%| | 42/13317 [00:26<2:15:41, 1.63it/s] {'loss': 0.8849, 'grad_norm': 2.680360794067383, 'learning_rate': 3.153153153153153e-07, 'epoch': 0.01} 0%| | 42/13317 [00:26<2:15:41, 1.63it/s] 0%| | 43/13317 [00:27<2:15:51, 1.63it/s] {'loss': 0.9718, 'grad_norm': 2.6914453506469727, 'learning_rate': 3.2282282282282283e-07, 'epoch': 0.01} 0%| | 43/13317 [00:27<2:15:51, 1.63it/s] 0%| | 44/13317 [00:27<2:15:50, 1.63it/s] {'loss': 0.9262, 'grad_norm': 2.440049171447754, 'learning_rate': 3.3033033033033036e-07, 'epoch': 0.01} 0%| | 44/13317 [00:27<2:15:50, 1.63it/s] 0%| | 45/13317 [00:28<2:15:51, 1.63it/s] {'loss': 0.8985, 'grad_norm': 2.3033666610717773, 'learning_rate': 3.378378378378379e-07, 'epoch': 0.01} 0%| | 45/13317 [00:28<2:15:51, 1.63it/s] 0%| | 46/13317 [00:29<2:15:49, 1.63it/s] {'loss': 1.048, 'grad_norm': 2.7851650714874268, 'learning_rate': 3.4534534534534535e-07, 'epoch': 0.01} 0%| | 46/13317 [00:29<2:15:49, 1.63it/s] 0%| | 47/13317 [00:29<2:15:43, 1.63it/s] {'loss': 0.8574, 'grad_norm': 2.1973562240600586, 'learning_rate': 3.528528528528529e-07, 'epoch': 0.01} 0%| | 47/13317 [00:29<2:15:43, 1.63it/s] 0%| | 48/13317 [00:30<2:15:40, 1.63it/s] {'loss': 0.8476, 'grad_norm': 2.209846019744873, 'learning_rate': 3.603603603603604e-07, 'epoch': 0.01} 0%| | 48/13317 [00:30<2:15:40, 1.63it/s] 0%| | 49/13317 [00:30<2:15:37, 1.63it/s] {'loss': 0.7862, 'grad_norm': 2.46240234375, 'learning_rate': 3.6786786786786786e-07, 'epoch': 0.01} 0%| | 49/13317 [00:31<2:15:37, 1.63it/s] 0%| | 50/13317 [00:31<2:15:36, 1.63it/s] {'loss': 1.0251, 'grad_norm': 3.1315271854400635, 'learning_rate': 3.7537537537537543e-07, 'epoch': 0.01} 0%| | 50/13317 [00:31<2:15:36, 1.63it/s] 0%| | 51/13317 [00:32<2:15:33, 1.63it/s] {'loss': 0.9626, 'grad_norm': 2.0481953620910645, 'learning_rate': 3.828828828828829e-07, 'epoch': 0.01} 0%| | 51/13317 [00:32<2:15:33, 1.63it/s] 0%| | 52/13317 [00:32<2:15:37, 1.63it/s] {'loss': 0.9023, 'grad_norm': 3.5687384605407715, 'learning_rate': 3.903903903903904e-07, 'epoch': 0.01} 0%| | 52/13317 [00:32<2:15:37, 1.63it/s] 0%| | 53/13317 [00:33<2:15:58, 1.63it/s] {'loss': 0.8777, 'grad_norm': 2.469125747680664, 'learning_rate': 3.978978978978979e-07, 'epoch': 0.01} 0%| | 53/13317 [00:33<2:15:58, 1.63it/s] 0%| | 54/13317 [00:33<2:15:44, 1.63it/s] {'loss': 0.9081, 'grad_norm': 2.282897472381592, 'learning_rate': 4.0540540540540546e-07, 'epoch': 0.01} 0%| | 54/13317 [00:34<2:15:44, 1.63it/s] 0%| | 55/13317 [00:34<2:15:40, 1.63it/s] {'loss': 0.8049, 'grad_norm': 2.256535291671753, 'learning_rate': 4.1291291291291293e-07, 'epoch': 0.01} 0%| | 55/13317 [00:34<2:15:40, 1.63it/s] 0%| | 56/13317 [00:35<2:15:42, 1.63it/s] {'loss': 0.8434, 'grad_norm': 1.9026448726654053, 'learning_rate': 4.204204204204205e-07, 'epoch': 0.01} 0%| | 56/13317 [00:35<2:15:42, 1.63it/s] 0%| | 57/13317 [00:35<2:15:45, 1.63it/s] {'loss': 0.992, 'grad_norm': 2.9717507362365723, 'learning_rate': 4.27927927927928e-07, 'epoch': 0.01} 0%| | 57/13317 [00:35<2:15:45, 1.63it/s] 0%| | 58/13317 [00:36<2:15:43, 1.63it/s] {'loss': 0.8866, 'grad_norm': 2.3037383556365967, 'learning_rate': 4.3543543543543544e-07, 'epoch': 0.01} 0%| | 58/13317 [00:36<2:15:43, 1.63it/s] 0%| | 59/13317 [00:37<2:15:43, 1.63it/s] {'loss': 0.9086, 'grad_norm': 3.0257582664489746, 'learning_rate': 4.4294294294294296e-07, 'epoch': 0.01} 0%| | 59/13317 [00:37<2:15:43, 1.63it/s] 0%| | 60/13317 [00:37<2:15:48, 1.63it/s] {'loss': 0.8835, 'grad_norm': 2.610701084136963, 'learning_rate': 4.504504504504505e-07, 'epoch': 0.01} 0%| | 60/13317 [00:37<2:15:48, 1.63it/s] 0%| | 61/13317 [00:38<2:15:42, 1.63it/s] {'loss': 1.0007, 'grad_norm': 3.014319896697998, 'learning_rate': 4.57957957957958e-07, 'epoch': 0.01} 0%| | 61/13317 [00:38<2:15:42, 1.63it/s] 0%| | 62/13317 [00:38<2:15:41, 1.63it/s] {'loss': 1.0755, 'grad_norm': 2.6657915115356445, 'learning_rate': 4.654654654654655e-07, 'epoch': 0.01} 0%| | 62/13317 [00:39<2:15:41, 1.63it/s] 0%| | 63/13317 [00:39<2:15:35, 1.63it/s] {'loss': 0.8531, 'grad_norm': 3.8196001052856445, 'learning_rate': 4.7297297297297305e-07, 'epoch': 0.01} 0%| | 63/13317 [00:39<2:15:35, 1.63it/s] 0%| | 64/13317 [00:40<2:15:32, 1.63it/s] {'loss': 0.9078, 'grad_norm': 2.7949588298797607, 'learning_rate': 4.804804804804805e-07, 'epoch': 0.01} 0%| | 64/13317 [00:40<2:15:32, 1.63it/s] 0%| | 65/13317 [00:40<2:15:27, 1.63it/s] {'loss': 0.9631, 'grad_norm': 2.4826414585113525, 'learning_rate': 4.87987987987988e-07, 'epoch': 0.01} 0%| | 65/13317 [00:40<2:15:27, 1.63it/s] 0%| | 66/13317 [00:41<2:15:32, 1.63it/s] {'loss': 1.0726, 'grad_norm': 2.6283771991729736, 'learning_rate': 4.954954954954956e-07, 'epoch': 0.01} 0%| | 66/13317 [00:41<2:15:32, 1.63it/s] 1%| | 67/13317 [00:41<2:15:29, 1.63it/s] {'loss': 1.0398, 'grad_norm': 2.652327537536621, 'learning_rate': 5.03003003003003e-07, 'epoch': 0.02} 1%| | 67/13317 [00:42<2:15:29, 1.63it/s] 1%| | 68/13317 [00:42<2:15:31, 1.63it/s] {'loss': 0.9084, 'grad_norm': 2.3473963737487793, 'learning_rate': 5.105105105105106e-07, 'epoch': 0.02} 1%| | 68/13317 [00:42<2:15:31, 1.63it/s] 1%| | 69/13317 [00:43<2:15:31, 1.63it/s] {'loss': 0.8491, 'grad_norm': 1.987104892730713, 'learning_rate': 5.180180180180181e-07, 'epoch': 0.02} 1%| | 69/13317 [00:43<2:15:31, 1.63it/s] 1%| | 70/13317 [00:43<2:15:27, 1.63it/s] {'loss': 0.81, 'grad_norm': 1.9609936475753784, 'learning_rate': 5.255255255255255e-07, 'epoch': 0.02} 1%| | 70/13317 [00:43<2:15:27, 1.63it/s] 1%| | 71/13317 [00:44<2:15:31, 1.63it/s] {'loss': 0.8824, 'grad_norm': 2.3594648838043213, 'learning_rate': 5.330330330330331e-07, 'epoch': 0.02} 1%| | 71/13317 [00:44<2:15:31, 1.63it/s] 1%| | 72/13317 [00:45<2:15:25, 1.63it/s] {'loss': 0.9915, 'grad_norm': 2.994713306427002, 'learning_rate': 5.405405405405406e-07, 'epoch': 0.02} 1%| | 72/13317 [00:45<2:15:25, 1.63it/s] 1%| | 73/13317 [00:45<2:15:22, 1.63it/s] {'loss': 0.9885, 'grad_norm': 2.5328166484832764, 'learning_rate': 5.48048048048048e-07, 'epoch': 0.02} 1%| | 73/13317 [00:45<2:15:22, 1.63it/s] 1%| | 74/13317 [00:46<2:15:27, 1.63it/s] {'loss': 1.043, 'grad_norm': 2.9895694255828857, 'learning_rate': 5.555555555555555e-07, 'epoch': 0.02} 1%| | 74/13317 [00:46<2:15:27, 1.63it/s] 1%| | 75/13317 [00:46<2:15:25, 1.63it/s] {'loss': 0.8529, 'grad_norm': 2.164886951446533, 'learning_rate': 5.630630630630631e-07, 'epoch': 0.02} 1%| | 75/13317 [00:46<2:15:25, 1.63it/s] 1%| | 76/13317 [00:47<2:15:23, 1.63it/s] {'loss': 0.9133, 'grad_norm': 2.141491651535034, 'learning_rate': 5.705705705705706e-07, 'epoch': 0.02} 1%| | 76/13317 [00:47<2:15:23, 1.63it/s] 1%| | 77/13317 [00:48<2:15:21, 1.63it/s] {'loss': 0.9168, 'grad_norm': 2.2878506183624268, 'learning_rate': 5.780780780780781e-07, 'epoch': 0.02} 1%| | 77/13317 [00:48<2:15:21, 1.63it/s] 1%| | 78/13317 [00:48<2:15:19, 1.63it/s] {'loss': 0.8776, 'grad_norm': 2.0337722301483154, 'learning_rate': 5.855855855855856e-07, 'epoch': 0.02} 1%| | 78/13317 [00:48<2:15:19, 1.63it/s] 1%| | 79/13317 [00:49<2:15:19, 1.63it/s] {'loss': 0.7832, 'grad_norm': 2.0297250747680664, 'learning_rate': 5.930930930930932e-07, 'epoch': 0.02} 1%| | 79/13317 [00:49<2:15:19, 1.63it/s] 1%| | 80/13317 [00:49<2:15:18, 1.63it/s] {'loss': 0.98, 'grad_norm': 2.2675063610076904, 'learning_rate': 6.006006006006006e-07, 'epoch': 0.02} 1%| | 80/13317 [00:50<2:15:18, 1.63it/s] 1%| | 81/13317 [00:50<2:15:22, 1.63it/s] {'loss': 0.8088, 'grad_norm': 2.5821239948272705, 'learning_rate': 6.081081081081082e-07, 'epoch': 0.02} 1%| | 81/13317 [00:50<2:15:22, 1.63it/s] 1%| | 82/13317 [00:51<2:15:21, 1.63it/s] {'loss': 0.8705, 'grad_norm': 2.456202507019043, 'learning_rate': 6.156156156156157e-07, 'epoch': 0.02} 1%| | 82/13317 [00:51<2:15:21, 1.63it/s] 1%| | 83/13317 [00:51<2:15:20, 1.63it/s] {'loss': 0.7088, 'grad_norm': 1.882301688194275, 'learning_rate': 6.231231231231232e-07, 'epoch': 0.02} 1%| | 83/13317 [00:51<2:15:20, 1.63it/s] 1%| | 84/13317 [00:52<2:15:17, 1.63it/s] {'loss': 0.7914, 'grad_norm': 1.9859850406646729, 'learning_rate': 6.306306306306306e-07, 'epoch': 0.02} 1%| | 84/13317 [00:52<2:15:17, 1.63it/s] 1%| | 85/13317 [00:52<2:15:14, 1.63it/s] {'loss': 1.0035, 'grad_norm': 2.474757671356201, 'learning_rate': 6.381381381381382e-07, 'epoch': 0.02} 1%| | 85/13317 [00:53<2:15:14, 1.63it/s] 1%| | 86/13317 [00:53<2:15:13, 1.63it/s] {'loss': 0.8838, 'grad_norm': 2.3561854362487793, 'learning_rate': 6.456456456456457e-07, 'epoch': 0.02} 1%| | 86/13317 [00:53<2:15:13, 1.63it/s] 1%| | 87/13317 [00:54<2:15:14, 1.63it/s] {'loss': 0.9739, 'grad_norm': 2.212247371673584, 'learning_rate': 6.531531531531532e-07, 'epoch': 0.02} 1%| | 87/13317 [00:54<2:15:14, 1.63it/s] 1%| | 88/13317 [00:54<2:15:07, 1.63it/s] {'loss': 0.7848, 'grad_norm': 2.2994441986083984, 'learning_rate': 6.606606606606607e-07, 'epoch': 0.02} 1%| | 88/13317 [00:54<2:15:07, 1.63it/s] 1%| | 89/13317 [00:55<2:15:10, 1.63it/s] {'loss': 0.7899, 'grad_norm': 2.832188606262207, 'learning_rate': 6.681681681681683e-07, 'epoch': 0.02} 1%| | 89/13317 [00:55<2:15:10, 1.63it/s] 1%| | 90/13317 [00:56<2:15:12, 1.63it/s] {'loss': 0.8165, 'grad_norm': 1.9242957830429077, 'learning_rate': 6.756756756756758e-07, 'epoch': 0.02} 1%| | 90/13317 [00:56<2:15:12, 1.63it/s] 1%| | 91/13317 [00:56<2:15:08, 1.63it/s] {'loss': 0.9076, 'grad_norm': 2.4717724323272705, 'learning_rate': 6.831831831831832e-07, 'epoch': 0.02} 1%| | 91/13317 [00:56<2:15:08, 1.63it/s] 1%| | 92/13317 [00:57<2:15:10, 1.63it/s] {'loss': 0.9452, 'grad_norm': 2.2474710941314697, 'learning_rate': 6.906906906906907e-07, 'epoch': 0.02} 1%| | 92/13317 [00:57<2:15:10, 1.63it/s] 1%| | 93/13317 [00:57<2:15:10, 1.63it/s] {'loss': 0.852, 'grad_norm': 2.179138660430908, 'learning_rate': 6.981981981981982e-07, 'epoch': 0.02} 1%| | 93/13317 [00:58<2:15:10, 1.63it/s] 1%| | 94/13317 [00:58<2:15:10, 1.63it/s] {'loss': 0.8762, 'grad_norm': 2.266545295715332, 'learning_rate': 7.057057057057058e-07, 'epoch': 0.02} 1%| | 94/13317 [00:58<2:15:10, 1.63it/s] 1%| | 95/13317 [00:59<2:15:08, 1.63it/s] {'loss': 0.8161, 'grad_norm': 2.158430337905884, 'learning_rate': 7.132132132132133e-07, 'epoch': 0.02} 1%| | 95/13317 [00:59<2:15:08, 1.63it/s] 1%| | 96/13317 [00:59<2:15:10, 1.63it/s] {'loss': 0.7645, 'grad_norm': 2.0708978176116943, 'learning_rate': 7.207207207207208e-07, 'epoch': 0.02} 1%| | 96/13317 [00:59<2:15:10, 1.63it/s] 1%| | 97/13317 [01:00<2:15:06, 1.63it/s] {'loss': 0.7951, 'grad_norm': 2.2121434211730957, 'learning_rate': 7.282282282282282e-07, 'epoch': 0.02} 1%| | 97/13317 [01:00<2:15:06, 1.63it/s] 1%| | 98/13317 [01:00<2:15:04, 1.63it/s] {'loss': 0.8189, 'grad_norm': 2.240325927734375, 'learning_rate': 7.357357357357357e-07, 'epoch': 0.02} 1%| | 98/13317 [01:01<2:15:04, 1.63it/s] 1%| | 99/13317 [01:01<2:15:09, 1.63it/s] {'loss': 0.8679, 'grad_norm': 1.895703911781311, 'learning_rate': 7.432432432432434e-07, 'epoch': 0.02} 1%| | 99/13317 [01:01<2:15:09, 1.63it/s] 1%| | 100/13317 [01:02<2:15:13, 1.63it/s] {'loss': 0.8382, 'grad_norm': 1.9695020914077759, 'learning_rate': 7.507507507507509e-07, 'epoch': 0.02} 1%| | 100/13317 [01:02<2:15:13, 1.63it/s] 1%| | 101/13317 [01:02<2:15:17, 1.63it/s] {'loss': 0.7639, 'grad_norm': 1.8849538564682007, 'learning_rate': 7.582582582582583e-07, 'epoch': 0.02} 1%| | 101/13317 [01:02<2:15:17, 1.63it/s] 1%| | 102/13317 [01:03<2:15:17, 1.63it/s] {'loss': 0.8949, 'grad_norm': 1.8158146142959595, 'learning_rate': 7.657657657657658e-07, 'epoch': 0.02} 1%| | 102/13317 [01:03<2:15:17, 1.63it/s] 1%| | 103/13317 [01:04<2:15:17, 1.63it/s] {'loss': 0.7631, 'grad_norm': 1.7382808923721313, 'learning_rate': 7.732732732732734e-07, 'epoch': 0.02} 1%| | 103/13317 [01:04<2:15:17, 1.63it/s] 1%| | 104/13317 [01:04<2:15:17, 1.63it/s] {'loss': 0.7894, 'grad_norm': 1.8516268730163574, 'learning_rate': 7.807807807807808e-07, 'epoch': 0.02} 1%| | 104/13317 [01:04<2:15:17, 1.63it/s] 1%| | 105/13317 [01:05<2:15:20, 1.63it/s] {'loss': 0.8441, 'grad_norm': 1.9690122604370117, 'learning_rate': 7.882882882882883e-07, 'epoch': 0.02} 1%| | 105/13317 [01:05<2:15:20, 1.63it/s] 1%| | 106/13317 [01:05<2:15:14, 1.63it/s] {'loss': 0.838, 'grad_norm': 1.9234217405319214, 'learning_rate': 7.957957957957958e-07, 'epoch': 0.02} 1%| | 106/13317 [01:06<2:15:14, 1.63it/s] 1%| | 107/13317 [01:06<2:15:11, 1.63it/s] {'loss': 0.7771, 'grad_norm': 1.6662170886993408, 'learning_rate': 8.033033033033034e-07, 'epoch': 0.02} 1%| | 107/13317 [01:06<2:15:11, 1.63it/s] 1%| | 108/13317 [01:07<2:15:11, 1.63it/s] {'loss': 0.7269, 'grad_norm': 2.324514865875244, 'learning_rate': 8.108108108108109e-07, 'epoch': 0.02} 1%| | 108/13317 [01:07<2:15:11, 1.63it/s] 1%| | 109/13317 [01:07<2:15:07, 1.63it/s] {'loss': 0.9361, 'grad_norm': 2.0554516315460205, 'learning_rate': 8.183183183183184e-07, 'epoch': 0.02} 1%| | 109/13317 [01:07<2:15:07, 1.63it/s] 1%| | 110/13317 [01:08<2:15:12, 1.63it/s] {'loss': 0.8198, 'grad_norm': 1.7538983821868896, 'learning_rate': 8.258258258258259e-07, 'epoch': 0.02} 1%| | 110/13317 [01:08<2:15:12, 1.63it/s] 1%| | 111/13317 [01:08<2:15:09, 1.63it/s] {'loss': 0.7424, 'grad_norm': 1.5495929718017578, 'learning_rate': 8.333333333333333e-07, 'epoch': 0.03} 1%| | 111/13317 [01:09<2:15:09, 1.63it/s] 1%| | 112/13317 [01:09<2:15:01, 1.63it/s] {'loss': 0.8447, 'grad_norm': 1.9926998615264893, 'learning_rate': 8.40840840840841e-07, 'epoch': 0.03} 1%| | 112/13317 [01:09<2:15:01, 1.63it/s] 1%| | 113/13317 [01:10<2:14:58, 1.63it/s] {'loss': 0.7471, 'grad_norm': 1.607262372970581, 'learning_rate': 8.483483483483485e-07, 'epoch': 0.03} 1%| | 113/13317 [01:10<2:14:58, 1.63it/s] 1%| | 114/13317 [01:10<2:15:00, 1.63it/s] {'loss': 0.7899, 'grad_norm': 2.0737133026123047, 'learning_rate': 8.55855855855856e-07, 'epoch': 0.03} 1%| | 114/13317 [01:10<2:15:00, 1.63it/s] 1%| | 115/13317 [01:11<2:15:02, 1.63it/s] {'loss': 0.9086, 'grad_norm': 1.8293637037277222, 'learning_rate': 8.633633633633634e-07, 'epoch': 0.03} 1%| | 115/13317 [01:11<2:15:02, 1.63it/s] 1%| | 116/13317 [01:12<2:14:59, 1.63it/s] {'loss': 0.7126, 'grad_norm': 1.5163137912750244, 'learning_rate': 8.708708708708709e-07, 'epoch': 0.03} 1%| | 116/13317 [01:12<2:14:59, 1.63it/s] 1%| | 117/13317 [01:12<2:14:58, 1.63it/s] {'loss': 0.7203, 'grad_norm': 1.5233790874481201, 'learning_rate': 8.783783783783785e-07, 'epoch': 0.03} 1%| | 117/13317 [01:12<2:14:58, 1.63it/s] 1%| | 118/13317 [01:13<2:14:58, 1.63it/s] {'loss': 0.7637, 'grad_norm': 1.7197405099868774, 'learning_rate': 8.858858858858859e-07, 'epoch': 0.03} 1%| | 118/13317 [01:13<2:14:58, 1.63it/s] 1%| | 119/13317 [01:13<2:14:52, 1.63it/s] {'loss': 0.8108, 'grad_norm': 1.9985202550888062, 'learning_rate': 8.933933933933935e-07, 'epoch': 0.03} 1%| | 119/13317 [01:13<2:14:52, 1.63it/s] 1%| | 120/13317 [01:14<2:15:01, 1.63it/s] {'loss': 0.6914, 'grad_norm': 1.7084733247756958, 'learning_rate': 9.00900900900901e-07, 'epoch': 0.03} 1%| | 120/13317 [01:14<2:15:01, 1.63it/s] 1%| | 121/13317 [01:15<2:14:58, 1.63it/s] {'loss': 0.7111, 'grad_norm': 1.5084526538848877, 'learning_rate': 9.084084084084085e-07, 'epoch': 0.03} 1%| | 121/13317 [01:15<2:14:58, 1.63it/s] 1%| | 122/13317 [01:15<2:14:54, 1.63it/s] {'loss': 0.7237, 'grad_norm': 1.5757193565368652, 'learning_rate': 9.15915915915916e-07, 'epoch': 0.03} 1%| | 122/13317 [01:15<2:14:54, 1.63it/s] 1%| | 123/13317 [01:16<2:14:56, 1.63it/s] {'loss': 0.7567, 'grad_norm': 1.6613407135009766, 'learning_rate': 9.234234234234235e-07, 'epoch': 0.03} 1%| | 123/13317 [01:16<2:14:56, 1.63it/s] 1%| | 124/13317 [01:16<2:14:55, 1.63it/s] {'loss': 0.8299, 'grad_norm': 1.7441282272338867, 'learning_rate': 9.30930930930931e-07, 'epoch': 0.03} 1%| | 124/13317 [01:17<2:14:55, 1.63it/s] 1%| | 125/13317 [01:17<2:14:58, 1.63it/s] {'loss': 0.784, 'grad_norm': 1.7721879482269287, 'learning_rate': 9.384384384384384e-07, 'epoch': 0.03} 1%| | 125/13317 [01:17<2:14:58, 1.63it/s] 1%| | 126/13317 [01:18<2:15:33, 1.62it/s] {'loss': 0.6477, 'grad_norm': 1.5495719909667969, 'learning_rate': 9.459459459459461e-07, 'epoch': 0.03} 1%| | 126/13317 [01:18<2:15:33, 1.62it/s] 1%| | 127/13317 [01:18<2:15:20, 1.62it/s] {'loss': 0.753, 'grad_norm': 1.6460978984832764, 'learning_rate': 9.534534534534536e-07, 'epoch': 0.03} 1%| | 127/13317 [01:18<2:15:20, 1.62it/s] 1%| | 128/13317 [01:19<2:15:07, 1.63it/s] {'loss': 0.6963, 'grad_norm': 1.657727837562561, 'learning_rate': 9.60960960960961e-07, 'epoch': 0.03} 1%| | 128/13317 [01:19<2:15:07, 1.63it/s] 1%| | 129/13317 [01:20<2:15:05, 1.63it/s] {'loss': 0.7287, 'grad_norm': 1.6593786478042603, 'learning_rate': 9.684684684684686e-07, 'epoch': 0.03} 1%| | 129/13317 [01:20<2:15:05, 1.63it/s] 1%| | 130/13317 [01:20<2:15:07, 1.63it/s] {'loss': 0.7579, 'grad_norm': 1.6817530393600464, 'learning_rate': 9.75975975975976e-07, 'epoch': 0.03} 1%| | 130/13317 [01:20<2:15:07, 1.63it/s] 1%| | 131/13317 [01:21<2:14:56, 1.63it/s] {'loss': 0.6719, 'grad_norm': 1.4413070678710938, 'learning_rate': 9.834834834834835e-07, 'epoch': 0.03} 1%| | 131/13317 [01:21<2:14:56, 1.63it/s] 1%| | 132/13317 [01:21<2:14:53, 1.63it/s] {'loss': 0.7026, 'grad_norm': 2.2124626636505127, 'learning_rate': 9.909909909909911e-07, 'epoch': 0.03} 1%| | 132/13317 [01:21<2:14:53, 1.63it/s] 1%| | 133/13317 [01:22<2:14:51, 1.63it/s] {'loss': 0.5593, 'grad_norm': 1.3846427202224731, 'learning_rate': 9.984984984984985e-07, 'epoch': 0.03} 1%| | 133/13317 [01:22<2:14:51, 1.63it/s] 1%| | 134/13317 [01:23<2:14:54, 1.63it/s] {'loss': 0.647, 'grad_norm': 1.4754674434661865, 'learning_rate': 1.006006006006006e-06, 'epoch': 0.03} 1%| | 134/13317 [01:23<2:14:54, 1.63it/s] 1%| | 135/13317 [01:23<2:15:02, 1.63it/s] {'loss': 0.6921, 'grad_norm': 1.5349516868591309, 'learning_rate': 1.0135135135135136e-06, 'epoch': 0.03} 1%| | 135/13317 [01:23<2:15:02, 1.63it/s] 1%| | 136/13317 [01:24<2:14:53, 1.63it/s] {'loss': 0.7064, 'grad_norm': 1.4412708282470703, 'learning_rate': 1.0210210210210212e-06, 'epoch': 0.03} 1%| | 136/13317 [01:24<2:14:53, 1.63it/s] 1%| | 137/13317 [01:24<2:14:53, 1.63it/s] {'loss': 0.706, 'grad_norm': 1.5820810794830322, 'learning_rate': 1.0285285285285286e-06, 'epoch': 0.03} 1%| | 137/13317 [01:25<2:14:53, 1.63it/s] 1%| | 138/13317 [01:25<2:14:46, 1.63it/s] {'loss': 0.7606, 'grad_norm': 1.5930715799331665, 'learning_rate': 1.0360360360360361e-06, 'epoch': 0.03} 1%| | 138/13317 [01:25<2:14:46, 1.63it/s] 1%| | 139/13317 [01:26<2:14:46, 1.63it/s] {'loss': 0.6457, 'grad_norm': 1.452947735786438, 'learning_rate': 1.0435435435435435e-06, 'epoch': 0.03} 1%| | 139/13317 [01:26<2:14:46, 1.63it/s] 1%| | 140/13317 [01:26<2:15:02, 1.63it/s] {'loss': 0.5897, 'grad_norm': 1.244608998298645, 'learning_rate': 1.051051051051051e-06, 'epoch': 0.03} 1%| | 140/13317 [01:26<2:15:02, 1.63it/s] 1%| | 141/13317 [01:27<2:14:55, 1.63it/s] {'loss': 0.7207, 'grad_norm': 1.452004075050354, 'learning_rate': 1.0585585585585587e-06, 'epoch': 0.03} 1%| | 141/13317 [01:27<2:14:55, 1.63it/s] 1%| | 142/13317 [01:27<2:14:50, 1.63it/s] {'loss': 0.6814, 'grad_norm': 1.611354112625122, 'learning_rate': 1.0660660660660662e-06, 'epoch': 0.03} 1%| | 142/13317 [01:28<2:14:50, 1.63it/s] 1%| | 143/13317 [01:28<2:14:50, 1.63it/s] {'loss': 0.7198, 'grad_norm': 1.4763097763061523, 'learning_rate': 1.0735735735735736e-06, 'epoch': 0.03} 1%| | 143/13317 [01:28<2:14:50, 1.63it/s] 1%| | 144/13317 [01:29<2:14:46, 1.63it/s] {'loss': 0.582, 'grad_norm': 1.4349634647369385, 'learning_rate': 1.0810810810810812e-06, 'epoch': 0.03} 1%| | 144/13317 [01:29<2:14:46, 1.63it/s] 1%| | 145/13317 [01:29<2:14:57, 1.63it/s] {'loss': 0.6567, 'grad_norm': 1.3705693483352661, 'learning_rate': 1.0885885885885887e-06, 'epoch': 0.03} 1%| | 145/13317 [01:29<2:14:57, 1.63it/s] 1%| | 146/13317 [01:30<2:14:50, 1.63it/s] {'loss': 0.636, 'grad_norm': 1.373970627784729, 'learning_rate': 1.096096096096096e-06, 'epoch': 0.03} 1%| | 146/13317 [01:30<2:14:50, 1.63it/s] 1%| | 147/13317 [01:31<2:14:45, 1.63it/s] {'loss': 0.6333, 'grad_norm': 1.4731109142303467, 'learning_rate': 1.1036036036036037e-06, 'epoch': 0.03} 1%| | 147/13317 [01:31<2:14:45, 1.63it/s] 1%| | 148/13317 [01:31<2:14:40, 1.63it/s] {'loss': 0.7008, 'grad_norm': 1.4753167629241943, 'learning_rate': 1.111111111111111e-06, 'epoch': 0.03} 1%| | 148/13317 [01:31<2:14:40, 1.63it/s] 1%| | 149/13317 [01:32<2:14:42, 1.63it/s] {'loss': 0.7357, 'grad_norm': 1.5240771770477295, 'learning_rate': 1.1186186186186188e-06, 'epoch': 0.03} 1%| | 149/13317 [01:32<2:14:42, 1.63it/s] 1%| | 150/13317 [01:32<2:14:45, 1.63it/s] {'loss': 0.6652, 'grad_norm': 1.3055368661880493, 'learning_rate': 1.1261261261261262e-06, 'epoch': 0.03} 1%| | 150/13317 [01:33<2:14:45, 1.63it/s] 1%| | 151/13317 [01:33<2:14:39, 1.63it/s] {'loss': 0.5688, 'grad_norm': 1.1227576732635498, 'learning_rate': 1.1336336336336338e-06, 'epoch': 0.03} 1%| | 151/13317 [01:33<2:14:39, 1.63it/s] 1%| | 152/13317 [01:34<2:14:40, 1.63it/s] {'loss': 0.691, 'grad_norm': 1.3222991228103638, 'learning_rate': 1.1411411411411411e-06, 'epoch': 0.03} 1%| | 152/13317 [01:34<2:14:40, 1.63it/s] 1%| | 153/13317 [01:34<2:14:33, 1.63it/s] {'loss': 0.6524, 'grad_norm': 1.4769599437713623, 'learning_rate': 1.148648648648649e-06, 'epoch': 0.03} 1%| | 153/13317 [01:34<2:14:33, 1.63it/s] 1%| | 154/13317 [01:35<2:14:33, 1.63it/s] {'loss': 0.6384, 'grad_norm': 1.217717170715332, 'learning_rate': 1.1561561561561563e-06, 'epoch': 0.03} 1%| | 154/13317 [01:35<2:14:33, 1.63it/s] 1%| | 155/13317 [01:35<2:14:42, 1.63it/s] {'loss': 0.7631, 'grad_norm': 1.3133342266082764, 'learning_rate': 1.1636636636636638e-06, 'epoch': 0.03} 1%| | 155/13317 [01:36<2:14:42, 1.63it/s] 1%| | 156/13317 [01:36<2:14:39, 1.63it/s] {'loss': 0.6493, 'grad_norm': 1.3204808235168457, 'learning_rate': 1.1711711711711712e-06, 'epoch': 0.04} 1%| | 156/13317 [01:36<2:14:39, 1.63it/s] 1%| | 157/13317 [01:37<2:14:37, 1.63it/s] {'loss': 0.4938, 'grad_norm': 1.0120199918746948, 'learning_rate': 1.1786786786786788e-06, 'epoch': 0.04} 1%| | 157/13317 [01:37<2:14:37, 1.63it/s] 1%| | 158/13317 [01:37<2:14:39, 1.63it/s] {'loss': 0.4971, 'grad_norm': 0.848904013633728, 'learning_rate': 1.1861861861861864e-06, 'epoch': 0.04} 1%| | 158/13317 [01:37<2:14:39, 1.63it/s] 1%| | 159/13317 [01:38<2:14:34, 1.63it/s] {'loss': 0.5508, 'grad_norm': 1.0298947095870972, 'learning_rate': 1.1936936936936937e-06, 'epoch': 0.04} 1%| | 159/13317 [01:38<2:14:34, 1.63it/s] 1%| | 160/13317 [01:39<2:14:41, 1.63it/s] {'loss': 0.5656, 'grad_norm': 1.0605055093765259, 'learning_rate': 1.2012012012012013e-06, 'epoch': 0.04} 1%| | 160/13317 [01:39<2:14:41, 1.63it/s] 1%| | 161/13317 [01:39<2:14:43, 1.63it/s] {'loss': 0.5773, 'grad_norm': 1.1684128046035767, 'learning_rate': 1.2087087087087089e-06, 'epoch': 0.04} 1%| | 161/13317 [01:39<2:14:43, 1.63it/s] 1%| | 162/13317 [01:40<2:14:42, 1.63it/s] {'loss': 0.6469, 'grad_norm': 1.3407416343688965, 'learning_rate': 1.2162162162162164e-06, 'epoch': 0.04} 1%| | 162/13317 [01:40<2:14:42, 1.63it/s] 1%| | 163/13317 [01:40<2:14:38, 1.63it/s] {'loss': 0.5085, 'grad_norm': 0.8725384473800659, 'learning_rate': 1.2237237237237238e-06, 'epoch': 0.04} 1%| | 163/13317 [01:41<2:14:38, 1.63it/s] 1%| | 164/13317 [01:41<2:14:33, 1.63it/s] {'loss': 0.6691, 'grad_norm': 1.4522643089294434, 'learning_rate': 1.2312312312312314e-06, 'epoch': 0.04} 1%| | 164/13317 [01:41<2:14:33, 1.63it/s] 1%| | 165/13317 [01:42<2:14:54, 1.62it/s] {'loss': 0.5529, 'grad_norm': 1.0491087436676025, 'learning_rate': 1.2387387387387387e-06, 'epoch': 0.04} 1%| | 165/13317 [01:42<2:14:54, 1.62it/s] 1%| | 166/13317 [01:42<2:14:42, 1.63it/s] {'loss': 0.553, 'grad_norm': 1.1312469244003296, 'learning_rate': 1.2462462462462463e-06, 'epoch': 0.04} 1%| | 166/13317 [01:42<2:14:42, 1.63it/s] 1%|▏ | 167/13317 [01:43<2:14:36, 1.63it/s] {'loss': 0.5845, 'grad_norm': 1.0458314418792725, 'learning_rate': 1.2537537537537539e-06, 'epoch': 0.04} 1%|▏ | 167/13317 [01:43<2:14:36, 1.63it/s] 1%|▏ | 168/13317 [01:43<2:14:32, 1.63it/s] {'loss': 0.621, 'grad_norm': 1.118099331855774, 'learning_rate': 1.2612612612612613e-06, 'epoch': 0.04} 1%|▏ | 168/13317 [01:44<2:14:32, 1.63it/s] 1%|▏ | 169/13317 [01:44<2:14:25, 1.63it/s] {'loss': 0.6034, 'grad_norm': 1.1403764486312866, 'learning_rate': 1.2687687687687688e-06, 'epoch': 0.04} 1%|▏ | 169/13317 [01:44<2:14:25, 1.63it/s] 1%|▏ | 170/13317 [01:45<2:15:06, 1.62it/s] {'loss': 0.5307, 'grad_norm': 0.9272016286849976, 'learning_rate': 1.2762762762762764e-06, 'epoch': 0.04} 1%|▏ | 170/13317 [01:45<2:15:06, 1.62it/s] 1%|▏ | 171/13317 [01:45<2:14:52, 1.62it/s] {'loss': 0.5721, 'grad_norm': 1.065805435180664, 'learning_rate': 1.2837837837837838e-06, 'epoch': 0.04} 1%|▏ | 171/13317 [01:45<2:14:52, 1.62it/s] 1%|▏ | 172/13317 [01:46<2:14:42, 1.63it/s] {'loss': 0.5909, 'grad_norm': 1.014577865600586, 'learning_rate': 1.2912912912912913e-06, 'epoch': 0.04} 1%|▏ | 172/13317 [01:46<2:14:42, 1.63it/s] 1%|▏ | 173/13317 [01:47<2:14:35, 1.63it/s] {'loss': 0.573, 'grad_norm': 1.0064231157302856, 'learning_rate': 1.2987987987987987e-06, 'epoch': 0.04} 1%|▏ | 173/13317 [01:47<2:14:35, 1.63it/s] 1%|▏ | 174/13317 [01:47<2:14:34, 1.63it/s] {'loss': 0.5824, 'grad_norm': 1.1200244426727295, 'learning_rate': 1.3063063063063065e-06, 'epoch': 0.04} 1%|▏ | 174/13317 [01:47<2:14:34, 1.63it/s] 1%|▏ | 175/13317 [01:48<2:14:40, 1.63it/s] {'loss': 0.4886, 'grad_norm': 0.9291266798973083, 'learning_rate': 1.313813813813814e-06, 'epoch': 0.04} 1%|▏ | 175/13317 [01:48<2:14:40, 1.63it/s] 1%|▏ | 176/13317 [01:48<2:14:32, 1.63it/s] {'loss': 0.4236, 'grad_norm': 0.7707609534263611, 'learning_rate': 1.3213213213213214e-06, 'epoch': 0.04} 1%|▏ | 176/13317 [01:49<2:14:32, 1.63it/s] 1%|▏ | 177/13317 [01:49<2:14:27, 1.63it/s] {'loss': 0.5926, 'grad_norm': 1.1367814540863037, 'learning_rate': 1.328828828828829e-06, 'epoch': 0.04} 1%|▏ | 177/13317 [01:49<2:14:27, 1.63it/s] 1%|▏ | 178/13317 [01:50<2:14:27, 1.63it/s] {'loss': 0.6172, 'grad_norm': 1.0455306768417358, 'learning_rate': 1.3363363363363366e-06, 'epoch': 0.04} 1%|▏ | 178/13317 [01:50<2:14:27, 1.63it/s] 1%|▏ | 179/13317 [01:50<2:14:25, 1.63it/s] {'loss': 0.5864, 'grad_norm': 1.0398110151290894, 'learning_rate': 1.343843843843844e-06, 'epoch': 0.04} 1%|▏ | 179/13317 [01:50<2:14:25, 1.63it/s] 1%|▏ | 180/13317 [01:51<2:14:28, 1.63it/s] {'loss': 0.5299, 'grad_norm': 1.0818138122558594, 'learning_rate': 1.3513513513513515e-06, 'epoch': 0.04} 1%|▏ | 180/13317 [01:51<2:14:28, 1.63it/s] 1%|▏ | 181/13317 [01:51<2:14:23, 1.63it/s] {'loss': 0.6409, 'grad_norm': 1.0900903940200806, 'learning_rate': 1.3588588588588589e-06, 'epoch': 0.04} 1%|▏ | 181/13317 [01:52<2:14:23, 1.63it/s] 1%|▏ | 182/13317 [01:52<2:14:24, 1.63it/s] {'loss': 0.4352, 'grad_norm': 0.8942669034004211, 'learning_rate': 1.3663663663663664e-06, 'epoch': 0.04} 1%|▏ | 182/13317 [01:52<2:14:24, 1.63it/s] 1%|▏ | 183/13317 [01:53<2:14:24, 1.63it/s] {'loss': 0.6471, 'grad_norm': 1.1304272413253784, 'learning_rate': 1.373873873873874e-06, 'epoch': 0.04} 1%|▏ | 183/13317 [01:53<2:14:24, 1.63it/s] 1%|▏ | 184/13317 [01:53<2:14:18, 1.63it/s] {'loss': 0.4891, 'grad_norm': 0.8067008256912231, 'learning_rate': 1.3813813813813814e-06, 'epoch': 0.04} 1%|▏ | 184/13317 [01:53<2:14:18, 1.63it/s] 1%|▏ | 185/13317 [01:54<2:14:25, 1.63it/s] {'loss': 0.5565, 'grad_norm': 0.9788805246353149, 'learning_rate': 1.3888888888888892e-06, 'epoch': 0.04} 1%|▏ | 185/13317 [01:54<2:14:25, 1.63it/s] 1%|▏ | 186/13317 [01:55<2:14:24, 1.63it/s] {'loss': 0.5082, 'grad_norm': 1.0063388347625732, 'learning_rate': 1.3963963963963963e-06, 'epoch': 0.04} 1%|▏ | 186/13317 [01:55<2:14:24, 1.63it/s] 1%|▏ | 187/13317 [01:55<2:14:21, 1.63it/s] {'loss': 0.6214, 'grad_norm': 1.0533509254455566, 'learning_rate': 1.403903903903904e-06, 'epoch': 0.04} 1%|▏ | 187/13317 [01:55<2:14:21, 1.63it/s] 1%|▏ | 188/13317 [01:56<2:14:19, 1.63it/s] {'loss': 0.5873, 'grad_norm': 1.0716609954833984, 'learning_rate': 1.4114114114114117e-06, 'epoch': 0.04} 1%|▏ | 188/13317 [01:56<2:14:19, 1.63it/s] 1%|▏ | 189/13317 [01:56<2:14:20, 1.63it/s] {'loss': 0.6015, 'grad_norm': 0.918181836605072, 'learning_rate': 1.418918918918919e-06, 'epoch': 0.04} 1%|▏ | 189/13317 [01:56<2:14:20, 1.63it/s] 1%|▏ | 190/13317 [01:57<2:14:22, 1.63it/s] {'loss': 0.4706, 'grad_norm': 0.8621892333030701, 'learning_rate': 1.4264264264264266e-06, 'epoch': 0.04} 1%|▏ | 190/13317 [01:57<2:14:22, 1.63it/s] 1%|▏ | 191/13317 [01:58<2:14:20, 1.63it/s] {'loss': 0.6133, 'grad_norm': 1.162561297416687, 'learning_rate': 1.433933933933934e-06, 'epoch': 0.04} 1%|▏ | 191/13317 [01:58<2:14:20, 1.63it/s] 1%|▏ | 192/13317 [01:58<2:14:15, 1.63it/s] {'loss': 0.6365, 'grad_norm': 1.1295325756072998, 'learning_rate': 1.4414414414414416e-06, 'epoch': 0.04} 1%|▏ | 192/13317 [01:58<2:14:15, 1.63it/s] 1%|▏ | 193/13317 [01:59<2:14:14, 1.63it/s] {'loss': 0.5639, 'grad_norm': 1.0269994735717773, 'learning_rate': 1.4489489489489491e-06, 'epoch': 0.04} 1%|▏ | 193/13317 [01:59<2:14:14, 1.63it/s] 1%|▏ | 194/13317 [01:59<2:14:14, 1.63it/s] {'loss': 0.5096, 'grad_norm': 1.001244068145752, 'learning_rate': 1.4564564564564565e-06, 'epoch': 0.04} 1%|▏ | 194/13317 [02:00<2:14:14, 1.63it/s] 1%|▏ | 195/13317 [02:00<2:14:18, 1.63it/s] {'loss': 0.5165, 'grad_norm': 1.137724757194519, 'learning_rate': 1.463963963963964e-06, 'epoch': 0.04} 1%|▏ | 195/13317 [02:00<2:14:18, 1.63it/s] 1%|▏ | 196/13317 [02:01<2:14:16, 1.63it/s] {'loss': 0.5814, 'grad_norm': 0.8230946660041809, 'learning_rate': 1.4714714714714714e-06, 'epoch': 0.04} 1%|▏ | 196/13317 [02:01<2:14:16, 1.63it/s] 1%|▏ | 197/13317 [02:01<2:14:11, 1.63it/s] {'loss': 0.5278, 'grad_norm': 0.7805623412132263, 'learning_rate': 1.478978978978979e-06, 'epoch': 0.04} 1%|▏ | 197/13317 [02:01<2:14:11, 1.63it/s] 1%|▏ | 198/13317 [02:02<2:14:09, 1.63it/s] {'loss': 0.5008, 'grad_norm': 0.9030367732048035, 'learning_rate': 1.4864864864864868e-06, 'epoch': 0.04} 1%|▏ | 198/13317 [02:02<2:14:09, 1.63it/s] 1%|▏ | 199/13317 [02:02<2:14:16, 1.63it/s] {'loss': 0.5205, 'grad_norm': 0.9333950281143188, 'learning_rate': 1.493993993993994e-06, 'epoch': 0.04} 1%|▏ | 199/13317 [02:03<2:14:16, 1.63it/s] 2%|▏ | 200/13317 [02:03<2:14:22, 1.63it/s] {'loss': 0.5589, 'grad_norm': 0.9850483536720276, 'learning_rate': 1.5015015015015017e-06, 'epoch': 0.05} 2%|▏ | 200/13317 [02:03<2:14:22, 1.63it/s] 2%|▏ | 201/13317 [02:04<2:14:22, 1.63it/s] {'loss': 0.4891, 'grad_norm': 0.9021425247192383, 'learning_rate': 1.5090090090090093e-06, 'epoch': 0.05} 2%|▏ | 201/13317 [02:04<2:14:22, 1.63it/s] 2%|▏ | 202/13317 [02:04<2:14:36, 1.62it/s] {'loss': 0.534, 'grad_norm': 1.1325154304504395, 'learning_rate': 1.5165165165165167e-06, 'epoch': 0.05} 2%|▏ | 202/13317 [02:04<2:14:36, 1.62it/s] 2%|▏ | 203/13317 [02:05<2:14:18, 1.63it/s] {'loss': 0.6164, 'grad_norm': 1.1065471172332764, 'learning_rate': 1.5240240240240242e-06, 'epoch': 0.05} 2%|▏ | 203/13317 [02:05<2:14:18, 1.63it/s] 2%|▏ | 204/13317 [02:06<2:14:32, 1.62it/s] {'loss': 0.5049, 'grad_norm': 1.1567038297653198, 'learning_rate': 1.5315315315315316e-06, 'epoch': 0.05} 2%|▏ | 204/13317 [02:06<2:14:32, 1.62it/s] 2%|▏ | 205/13317 [02:06<2:14:35, 1.62it/s] {'loss': 0.6127, 'grad_norm': 0.8912550806999207, 'learning_rate': 1.5390390390390392e-06, 'epoch': 0.05} 2%|▏ | 205/13317 [02:06<2:14:35, 1.62it/s] 2%|▏ | 206/13317 [02:07<2:14:27, 1.63it/s] {'loss': 0.5048, 'grad_norm': 0.9775712490081787, 'learning_rate': 1.5465465465465467e-06, 'epoch': 0.05} 2%|▏ | 206/13317 [02:07<2:14:27, 1.63it/s] 2%|▏ | 207/13317 [02:07<2:14:22, 1.63it/s] {'loss': 0.6139, 'grad_norm': 1.0000442266464233, 'learning_rate': 1.5540540540540541e-06, 'epoch': 0.05} 2%|▏ | 207/13317 [02:08<2:14:22, 1.63it/s] 2%|▏ | 208/13317 [02:08<2:14:15, 1.63it/s] {'loss': 0.5767, 'grad_norm': 0.9816790223121643, 'learning_rate': 1.5615615615615617e-06, 'epoch': 0.05} 2%|▏ | 208/13317 [02:08<2:14:15, 1.63it/s] 2%|▏ | 209/13317 [02:09<2:14:06, 1.63it/s] {'loss': 0.5635, 'grad_norm': 0.9469976425170898, 'learning_rate': 1.569069069069069e-06, 'epoch': 0.05} 2%|▏ | 209/13317 [02:09<2:14:06, 1.63it/s] 2%|▏ | 210/13317 [02:09<2:14:15, 1.63it/s] {'loss': 0.5211, 'grad_norm': 1.0825903415679932, 'learning_rate': 1.5765765765765766e-06, 'epoch': 0.05} 2%|▏ | 210/13317 [02:09<2:14:15, 1.63it/s] 2%|▏ | 211/13317 [02:10<2:14:16, 1.63it/s] {'loss': 0.5048, 'grad_norm': 0.8686360716819763, 'learning_rate': 1.5840840840840844e-06, 'epoch': 0.05} 2%|▏ | 211/13317 [02:10<2:14:16, 1.63it/s] 2%|▏ | 212/13317 [02:10<2:14:10, 1.63it/s] {'loss': 0.6054, 'grad_norm': 1.098149061203003, 'learning_rate': 1.5915915915915916e-06, 'epoch': 0.05} 2%|▏ | 212/13317 [02:11<2:14:10, 1.63it/s] 2%|▏ | 213/13317 [02:11<2:14:05, 1.63it/s] {'loss': 0.5983, 'grad_norm': 0.9329115152359009, 'learning_rate': 1.5990990990990993e-06, 'epoch': 0.05} 2%|▏ | 213/13317 [02:11<2:14:05, 1.63it/s] 2%|▏ | 214/13317 [02:12<2:14:07, 1.63it/s] {'loss': 0.5157, 'grad_norm': 0.8366087079048157, 'learning_rate': 1.6066066066066067e-06, 'epoch': 0.05} 2%|▏ | 214/13317 [02:12<2:14:07, 1.63it/s] 2%|▏ | 215/13317 [02:12<2:14:04, 1.63it/s] {'loss': 0.455, 'grad_norm': 0.8130690455436707, 'learning_rate': 1.6141141141141143e-06, 'epoch': 0.05} 2%|▏ | 215/13317 [02:12<2:14:04, 1.63it/s] 2%|▏ | 216/13317 [02:13<2:14:01, 1.63it/s] {'loss': 0.5575, 'grad_norm': 0.945459246635437, 'learning_rate': 1.6216216216216219e-06, 'epoch': 0.05} 2%|▏ | 216/13317 [02:13<2:14:01, 1.63it/s] 2%|▏ | 217/13317 [02:14<2:14:01, 1.63it/s] {'loss': 0.5237, 'grad_norm': 0.9335184097290039, 'learning_rate': 1.6291291291291292e-06, 'epoch': 0.05} 2%|▏ | 217/13317 [02:14<2:14:01, 1.63it/s] 2%|▏ | 218/13317 [02:14<2:13:56, 1.63it/s] {'loss': 0.542, 'grad_norm': 0.8258176445960999, 'learning_rate': 1.6366366366366368e-06, 'epoch': 0.05} 2%|▏ | 218/13317 [02:14<2:13:56, 1.63it/s] 2%|▏ | 219/13317 [02:15<2:13:54, 1.63it/s] {'loss': 0.6317, 'grad_norm': 1.1205437183380127, 'learning_rate': 1.6441441441441444e-06, 'epoch': 0.05} 2%|▏ | 219/13317 [02:15<2:13:54, 1.63it/s] 2%|▏ | 220/13317 [02:15<2:13:55, 1.63it/s] {'loss': 0.513, 'grad_norm': 1.0060783624649048, 'learning_rate': 1.6516516516516517e-06, 'epoch': 0.05} 2%|▏ | 220/13317 [02:16<2:13:55, 1.63it/s] 2%|▏ | 221/13317 [02:16<2:13:51, 1.63it/s] {'loss': 0.4756, 'grad_norm': 0.8755509257316589, 'learning_rate': 1.6591591591591593e-06, 'epoch': 0.05} 2%|▏ | 221/13317 [02:16<2:13:51, 1.63it/s] 2%|▏ | 222/13317 [02:17<2:13:51, 1.63it/s] {'loss': 0.4336, 'grad_norm': 0.7452938556671143, 'learning_rate': 1.6666666666666667e-06, 'epoch': 0.05} 2%|▏ | 222/13317 [02:17<2:13:51, 1.63it/s] 2%|▏ | 223/13317 [02:17<2:13:48, 1.63it/s] {'loss': 0.6685, 'grad_norm': 0.8382483720779419, 'learning_rate': 1.6741741741741742e-06, 'epoch': 0.05} 2%|▏ | 223/13317 [02:17<2:13:48, 1.63it/s] 2%|▏ | 224/13317 [02:18<2:13:48, 1.63it/s] {'loss': 0.4601, 'grad_norm': 0.8271287083625793, 'learning_rate': 1.681681681681682e-06, 'epoch': 0.05} 2%|▏ | 224/13317 [02:18<2:13:48, 1.63it/s] 2%|▏ | 225/13317 [02:18<2:13:51, 1.63it/s] {'loss': 0.5016, 'grad_norm': 0.8500949144363403, 'learning_rate': 1.6891891891891894e-06, 'epoch': 0.05} 2%|▏ | 225/13317 [02:19<2:13:51, 1.63it/s] 2%|▏ | 226/13317 [02:19<2:13:49, 1.63it/s] {'loss': 0.4385, 'grad_norm': 0.7964163422584534, 'learning_rate': 1.696696696696697e-06, 'epoch': 0.05} 2%|▏ | 226/13317 [02:19<2:13:49, 1.63it/s] 2%|▏ | 227/13317 [02:20<2:13:50, 1.63it/s] {'loss': 0.3599, 'grad_norm': 0.7702478766441345, 'learning_rate': 1.7042042042042043e-06, 'epoch': 0.05} 2%|▏ | 227/13317 [02:20<2:13:50, 1.63it/s] 2%|▏ | 228/13317 [02:20<2:13:50, 1.63it/s] {'loss': 0.4629, 'grad_norm': 0.7608579993247986, 'learning_rate': 1.711711711711712e-06, 'epoch': 0.05} 2%|▏ | 228/13317 [02:20<2:13:50, 1.63it/s] 2%|▏ | 229/13317 [02:21<2:13:54, 1.63it/s] {'loss': 0.5383, 'grad_norm': 0.899407684803009, 'learning_rate': 1.7192192192192195e-06, 'epoch': 0.05} 2%|▏ | 229/13317 [02:21<2:13:54, 1.63it/s] 2%|▏ | 230/13317 [02:22<2:13:48, 1.63it/s] {'loss': 0.5725, 'grad_norm': 0.8703835010528564, 'learning_rate': 1.7267267267267268e-06, 'epoch': 0.05} 2%|▏ | 230/13317 [02:22<2:13:48, 1.63it/s] 2%|▏ | 231/13317 [02:22<2:13:49, 1.63it/s] {'loss': 0.5911, 'grad_norm': 0.8942667245864868, 'learning_rate': 1.7342342342342344e-06, 'epoch': 0.05} 2%|▏ | 231/13317 [02:22<2:13:49, 1.63it/s] 2%|▏ | 232/13317 [02:23<2:13:50, 1.63it/s] {'loss': 0.468, 'grad_norm': 0.8283909559249878, 'learning_rate': 1.7417417417417418e-06, 'epoch': 0.05} 2%|▏ | 232/13317 [02:23<2:13:50, 1.63it/s] 2%|▏ | 233/13317 [02:23<2:13:47, 1.63it/s] {'loss': 0.4996, 'grad_norm': 0.8855001926422119, 'learning_rate': 1.7492492492492493e-06, 'epoch': 0.05} 2%|▏ | 233/13317 [02:24<2:13:47, 1.63it/s] 2%|▏ | 234/13317 [02:24<2:13:46, 1.63it/s] {'loss': 0.7024, 'grad_norm': 0.9389349222183228, 'learning_rate': 1.756756756756757e-06, 'epoch': 0.05} 2%|▏ | 234/13317 [02:24<2:13:46, 1.63it/s] 2%|▏ | 235/13317 [02:25<2:13:48, 1.63it/s] {'loss': 0.4885, 'grad_norm': 0.8391992449760437, 'learning_rate': 1.7642642642642643e-06, 'epoch': 0.05} 2%|▏ | 235/13317 [02:25<2:13:48, 1.63it/s] 2%|▏ | 236/13317 [02:25<2:13:45, 1.63it/s] {'loss': 0.4532, 'grad_norm': 0.9417365789413452, 'learning_rate': 1.7717717717717719e-06, 'epoch': 0.05} 2%|▏ | 236/13317 [02:25<2:13:45, 1.63it/s] 2%|▏ | 237/13317 [02:26<2:13:42, 1.63it/s] {'loss': 0.5012, 'grad_norm': 0.8652413487434387, 'learning_rate': 1.7792792792792792e-06, 'epoch': 0.05} 2%|▏ | 237/13317 [02:26<2:13:42, 1.63it/s] 2%|▏ | 238/13317 [02:26<2:13:40, 1.63it/s] {'loss': 0.5094, 'grad_norm': 0.9209291338920593, 'learning_rate': 1.786786786786787e-06, 'epoch': 0.05} 2%|▏ | 238/13317 [02:27<2:13:40, 1.63it/s] 2%|▏ | 239/13317 [02:27<2:13:41, 1.63it/s] {'loss': 0.3937, 'grad_norm': 0.5748335719108582, 'learning_rate': 1.7942942942942946e-06, 'epoch': 0.05} 2%|▏ | 239/13317 [02:27<2:13:41, 1.63it/s] 2%|▏ | 240/13317 [02:28<2:13:40, 1.63it/s] {'loss': 0.502, 'grad_norm': 0.9317153692245483, 'learning_rate': 1.801801801801802e-06, 'epoch': 0.05} 2%|▏ | 240/13317 [02:28<2:13:40, 1.63it/s] 2%|▏ | 241/13317 [02:28<2:13:37, 1.63it/s] {'loss': 0.4844, 'grad_norm': 0.9431211352348328, 'learning_rate': 1.8093093093093095e-06, 'epoch': 0.05} 2%|▏ | 241/13317 [02:28<2:13:37, 1.63it/s] 2%|▏ | 242/13317 [02:29<2:13:36, 1.63it/s] {'loss': 0.399, 'grad_norm': 0.7144924998283386, 'learning_rate': 1.816816816816817e-06, 'epoch': 0.05} 2%|▏ | 242/13317 [02:29<2:13:36, 1.63it/s] 2%|▏ | 243/13317 [02:30<2:13:38, 1.63it/s] {'loss': 0.5149, 'grad_norm': 0.8570777773857117, 'learning_rate': 1.8243243243243245e-06, 'epoch': 0.05} 2%|▏ | 243/13317 [02:30<2:13:38, 1.63it/s] 2%|▏ | 244/13317 [02:30<2:13:37, 1.63it/s] {'loss': 0.4967, 'grad_norm': 0.8890277743339539, 'learning_rate': 1.831831831831832e-06, 'epoch': 0.05} 2%|▏ | 244/13317 [02:30<2:13:37, 1.63it/s] 2%|▏ | 245/13317 [02:31<2:13:39, 1.63it/s] {'loss': 0.5036, 'grad_norm': 0.8584218621253967, 'learning_rate': 1.8393393393393394e-06, 'epoch': 0.06} 2%|▏ | 245/13317 [02:31<2:13:39, 1.63it/s] 2%|▏ | 246/13317 [02:31<2:13:38, 1.63it/s] {'loss': 0.584, 'grad_norm': 0.9183411002159119, 'learning_rate': 1.846846846846847e-06, 'epoch': 0.06} 2%|▏ | 246/13317 [02:31<2:13:38, 1.63it/s] 2%|▏ | 247/13317 [02:32<2:13:36, 1.63it/s] {'loss': 0.6233, 'grad_norm': 0.933334469795227, 'learning_rate': 1.8543543543543545e-06, 'epoch': 0.06} 2%|▏ | 247/13317 [02:32<2:13:36, 1.63it/s] 2%|▏ | 248/13317 [02:33<2:13:37, 1.63it/s] {'loss': 0.4422, 'grad_norm': 0.7942708134651184, 'learning_rate': 1.861861861861862e-06, 'epoch': 0.06} 2%|▏ | 248/13317 [02:33<2:13:37, 1.63it/s] 2%|▏ | 249/13317 [02:33<2:13:34, 1.63it/s] {'loss': 0.5316, 'grad_norm': 0.8726782202720642, 'learning_rate': 1.8693693693693697e-06, 'epoch': 0.06} 2%|▏ | 249/13317 [02:33<2:13:34, 1.63it/s] 2%|▏ | 250/13317 [02:34<2:13:36, 1.63it/s] {'loss': 0.575, 'grad_norm': 0.8334015011787415, 'learning_rate': 1.8768768768768768e-06, 'epoch': 0.06} 2%|▏ | 250/13317 [02:34<2:13:36, 1.63it/s] 2%|▏ | 251/13317 [02:34<2:13:34, 1.63it/s] {'loss': 0.5157, 'grad_norm': 0.8072992563247681, 'learning_rate': 1.8843843843843846e-06, 'epoch': 0.06} 2%|▏ | 251/13317 [02:35<2:13:34, 1.63it/s] 2%|▏ | 252/13317 [02:35<2:13:30, 1.63it/s] {'loss': 0.4754, 'grad_norm': 0.7553905248641968, 'learning_rate': 1.8918918918918922e-06, 'epoch': 0.06} 2%|▏ | 252/13317 [02:35<2:13:30, 1.63it/s] 2%|▏ | 253/13317 [02:36<2:13:30, 1.63it/s] {'loss': 0.5008, 'grad_norm': 0.795973002910614, 'learning_rate': 1.8993993993993996e-06, 'epoch': 0.06} 2%|▏ | 253/13317 [02:36<2:13:30, 1.63it/s] 2%|▏ | 254/13317 [02:36<2:13:31, 1.63it/s] {'loss': 0.5195, 'grad_norm': 0.8347752094268799, 'learning_rate': 1.9069069069069071e-06, 'epoch': 0.06} 2%|▏ | 254/13317 [02:36<2:13:31, 1.63it/s] 2%|▏ | 255/13317 [02:37<2:13:28, 1.63it/s] {'loss': 0.5405, 'grad_norm': 0.9817022085189819, 'learning_rate': 1.9144144144144145e-06, 'epoch': 0.06} 2%|▏ | 255/13317 [02:37<2:13:28, 1.63it/s] 2%|▏ | 256/13317 [02:37<2:13:27, 1.63it/s] {'loss': 0.4624, 'grad_norm': 0.8740941882133484, 'learning_rate': 1.921921921921922e-06, 'epoch': 0.06} 2%|▏ | 256/13317 [02:38<2:13:27, 1.63it/s] 2%|▏ | 257/13317 [02:38<2:13:30, 1.63it/s] {'loss': 0.5269, 'grad_norm': 0.7141740322113037, 'learning_rate': 1.9294294294294296e-06, 'epoch': 0.06} 2%|▏ | 257/13317 [02:38<2:13:30, 1.63it/s] 2%|▏ | 258/13317 [02:39<2:13:31, 1.63it/s] {'loss': 0.6435, 'grad_norm': 0.9771051406860352, 'learning_rate': 1.9369369369369372e-06, 'epoch': 0.06} 2%|▏ | 258/13317 [02:39<2:13:31, 1.63it/s] 2%|▏ | 259/13317 [02:39<2:13:31, 1.63it/s] {'loss': 0.5391, 'grad_norm': 0.8392442464828491, 'learning_rate': 1.944444444444445e-06, 'epoch': 0.06} 2%|▏ | 259/13317 [02:39<2:13:31, 1.63it/s] 2%|▏ | 260/13317 [02:40<2:13:31, 1.63it/s] {'loss': 0.4884, 'grad_norm': 0.7445134520530701, 'learning_rate': 1.951951951951952e-06, 'epoch': 0.06} 2%|▏ | 260/13317 [02:40<2:13:31, 1.63it/s] 2%|▏ | 261/13317 [02:41<2:13:33, 1.63it/s] {'loss': 0.5251, 'grad_norm': 0.943398118019104, 'learning_rate': 1.9594594594594595e-06, 'epoch': 0.06} 2%|▏ | 261/13317 [02:41<2:13:33, 1.63it/s] 2%|▏ | 262/13317 [02:41<2:13:29, 1.63it/s] {'loss': 0.5295, 'grad_norm': 0.944298267364502, 'learning_rate': 1.966966966966967e-06, 'epoch': 0.06} 2%|▏ | 262/13317 [02:41<2:13:29, 1.63it/s] 2%|▏ | 263/13317 [02:42<2:13:30, 1.63it/s] {'loss': 0.4428, 'grad_norm': 0.69712895154953, 'learning_rate': 1.9744744744744747e-06, 'epoch': 0.06} 2%|▏ | 263/13317 [02:42<2:13:30, 1.63it/s] 2%|▏ | 264/13317 [02:42<2:13:32, 1.63it/s] {'loss': 0.4701, 'grad_norm': 0.7624127268791199, 'learning_rate': 1.9819819819819822e-06, 'epoch': 0.06} 2%|▏ | 264/13317 [02:43<2:13:32, 1.63it/s] 2%|▏ | 265/13317 [02:43<2:13:30, 1.63it/s] {'loss': 0.4866, 'grad_norm': 0.7792107462882996, 'learning_rate': 1.98948948948949e-06, 'epoch': 0.06} 2%|▏ | 265/13317 [02:43<2:13:30, 1.63it/s] 2%|▏ | 266/13317 [02:44<2:13:27, 1.63it/s] {'loss': 0.5844, 'grad_norm': 0.8491565585136414, 'learning_rate': 1.996996996996997e-06, 'epoch': 0.06} 2%|▏ | 266/13317 [02:44<2:13:27, 1.63it/s] 2%|▏ | 267/13317 [02:44<2:13:26, 1.63it/s] {'loss': 0.5189, 'grad_norm': 0.8475441336631775, 'learning_rate': 2.0045045045045045e-06, 'epoch': 0.06} 2%|▏ | 267/13317 [02:44<2:13:26, 1.63it/s] 2%|▏ | 268/13317 [02:45<2:13:23, 1.63it/s] {'loss': 0.5341, 'grad_norm': 0.8206396102905273, 'learning_rate': 2.012012012012012e-06, 'epoch': 0.06} 2%|▏ | 268/13317 [02:45<2:13:23, 1.63it/s] 2%|▏ | 269/13317 [02:45<2:13:24, 1.63it/s] {'loss': 0.5189, 'grad_norm': 0.841033399105072, 'learning_rate': 2.0195195195195197e-06, 'epoch': 0.06} 2%|▏ | 269/13317 [02:46<2:13:24, 1.63it/s] 2%|▏ | 270/13317 [02:46<2:13:25, 1.63it/s] {'loss': 0.5185, 'grad_norm': 0.7907808423042297, 'learning_rate': 2.0270270270270273e-06, 'epoch': 0.06} 2%|▏ | 270/13317 [02:46<2:13:25, 1.63it/s] 2%|▏ | 271/13317 [02:47<2:13:23, 1.63it/s] {'loss': 0.5065, 'grad_norm': 0.9445785284042358, 'learning_rate': 2.034534534534535e-06, 'epoch': 0.06} 2%|▏ | 271/13317 [02:47<2:13:23, 1.63it/s] 2%|▏ | 272/13317 [02:47<2:13:28, 1.63it/s] {'loss': 0.5054, 'grad_norm': 0.8224158883094788, 'learning_rate': 2.0420420420420424e-06, 'epoch': 0.06} 2%|▏ | 272/13317 [02:47<2:13:28, 1.63it/s] 2%|▏ | 273/13317 [02:48<2:13:31, 1.63it/s] {'loss': 0.5465, 'grad_norm': 0.7618244290351868, 'learning_rate': 2.0495495495495496e-06, 'epoch': 0.06} 2%|▏ | 273/13317 [02:48<2:13:31, 1.63it/s] 2%|▏ | 274/13317 [02:49<2:13:33, 1.63it/s] {'loss': 0.5051, 'grad_norm': 0.8075991272926331, 'learning_rate': 2.057057057057057e-06, 'epoch': 0.06} 2%|▏ | 274/13317 [02:49<2:13:33, 1.63it/s] 2%|▏ | 275/13317 [02:49<2:13:31, 1.63it/s] {'loss': 0.5658, 'grad_norm': 0.8247211575508118, 'learning_rate': 2.0645645645645647e-06, 'epoch': 0.06} 2%|▏ | 275/13317 [02:49<2:13:31, 1.63it/s] 2%|▏ | 276/13317 [02:50<2:13:38, 1.63it/s] {'loss': 0.4855, 'grad_norm': 0.7994195818901062, 'learning_rate': 2.0720720720720723e-06, 'epoch': 0.06} 2%|▏ | 276/13317 [02:50<2:13:38, 1.63it/s] 2%|▏ | 277/13317 [02:50<2:13:55, 1.62it/s] {'loss': 0.532, 'grad_norm': 0.7584813833236694, 'learning_rate': 2.07957957957958e-06, 'epoch': 0.06} 2%|▏ | 277/13317 [02:51<2:13:55, 1.62it/s] 2%|▏ | 278/13317 [02:51<2:13:43, 1.63it/s] {'loss': 0.5082, 'grad_norm': 0.8021715879440308, 'learning_rate': 2.087087087087087e-06, 'epoch': 0.06} 2%|▏ | 278/13317 [02:51<2:13:43, 1.63it/s] 2%|▏ | 279/13317 [02:52<2:13:34, 1.63it/s] {'loss': 0.4618, 'grad_norm': 0.7866194248199463, 'learning_rate': 2.0945945945945946e-06, 'epoch': 0.06} 2%|▏ | 279/13317 [02:52<2:13:34, 1.63it/s] 2%|▏ | 280/13317 [02:52<2:14:15, 1.62it/s] {'loss': 0.4097, 'grad_norm': 0.6870883703231812, 'learning_rate': 2.102102102102102e-06, 'epoch': 0.06} 2%|▏ | 280/13317 [02:52<2:14:15, 1.62it/s] 2%|▏ | 281/13317 [02:53<2:14:01, 1.62it/s] {'loss': 0.4818, 'grad_norm': 0.9430485963821411, 'learning_rate': 2.1096096096096097e-06, 'epoch': 0.06} 2%|▏ | 281/13317 [02:53<2:14:01, 1.62it/s] 2%|▏ | 282/13317 [02:53<2:13:47, 1.62it/s] {'loss': 0.4226, 'grad_norm': 0.6739403605461121, 'learning_rate': 2.1171171171171173e-06, 'epoch': 0.06} 2%|▏ | 282/13317 [02:54<2:13:47, 1.62it/s] 2%|▏ | 283/13317 [02:54<2:13:37, 1.63it/s] {'loss': 0.3826, 'grad_norm': 0.7420628666877747, 'learning_rate': 2.124624624624625e-06, 'epoch': 0.06} 2%|▏ | 283/13317 [02:54<2:13:37, 1.63it/s] 2%|▏ | 284/13317 [02:55<2:13:29, 1.63it/s] {'loss': 0.4964, 'grad_norm': 0.8750054836273193, 'learning_rate': 2.1321321321321325e-06, 'epoch': 0.06} 2%|▏ | 284/13317 [02:55<2:13:29, 1.63it/s] 2%|▏ | 285/13317 [02:55<2:13:33, 1.63it/s] {'loss': 0.459, 'grad_norm': 0.774254322052002, 'learning_rate': 2.13963963963964e-06, 'epoch': 0.06} 2%|▏ | 285/13317 [02:55<2:13:33, 1.63it/s] 2%|▏ | 286/13317 [02:56<2:13:27, 1.63it/s] {'loss': 0.5433, 'grad_norm': 0.8343825936317444, 'learning_rate': 2.147147147147147e-06, 'epoch': 0.06} 2%|▏ | 286/13317 [02:56<2:13:27, 1.63it/s] 2%|▏ | 287/13317 [02:57<2:13:21, 1.63it/s] {'loss': 0.4295, 'grad_norm': 0.8560160398483276, 'learning_rate': 2.1546546546546548e-06, 'epoch': 0.06} 2%|▏ | 287/13317 [02:57<2:13:21, 1.63it/s] 2%|▏ | 288/13317 [02:57<2:13:16, 1.63it/s] {'loss': 0.4836, 'grad_norm': 0.8438105583190918, 'learning_rate': 2.1621621621621623e-06, 'epoch': 0.06} 2%|▏ | 288/13317 [02:57<2:13:16, 1.63it/s] 2%|▏ | 289/13317 [02:58<2:13:13, 1.63it/s] {'loss': 0.4323, 'grad_norm': 0.7245703339576721, 'learning_rate': 2.16966966966967e-06, 'epoch': 0.07} 2%|▏ | 289/13317 [02:58<2:13:13, 1.63it/s] 2%|▏ | 290/13317 [02:58<2:13:22, 1.63it/s] {'loss': 0.4935, 'grad_norm': 0.7448390126228333, 'learning_rate': 2.1771771771771775e-06, 'epoch': 0.07} 2%|▏ | 290/13317 [02:58<2:13:22, 1.63it/s] 2%|▏ | 291/13317 [02:59<2:13:17, 1.63it/s] {'loss': 0.487, 'grad_norm': 0.852615237236023, 'learning_rate': 2.1846846846846846e-06, 'epoch': 0.07} 2%|▏ | 291/13317 [02:59<2:13:17, 1.63it/s] 2%|▏ | 292/13317 [03:00<2:13:31, 1.63it/s] {'loss': 0.4669, 'grad_norm': 0.7476748824119568, 'learning_rate': 2.192192192192192e-06, 'epoch': 0.07} 2%|▏ | 292/13317 [03:00<2:13:31, 1.63it/s] 2%|▏ | 293/13317 [03:00<2:13:25, 1.63it/s] {'loss': 0.5296, 'grad_norm': 0.8050178289413452, 'learning_rate': 2.1996996996996998e-06, 'epoch': 0.07} 2%|▏ | 293/13317 [03:00<2:13:25, 1.63it/s] 2%|▏ | 294/13317 [03:01<2:13:21, 1.63it/s] {'loss': 0.4961, 'grad_norm': 0.7921327352523804, 'learning_rate': 2.2072072072072073e-06, 'epoch': 0.07} 2%|▏ | 294/13317 [03:01<2:13:21, 1.63it/s] 2%|▏ | 295/13317 [03:01<2:13:37, 1.62it/s] {'loss': 0.4574, 'grad_norm': 0.8301498293876648, 'learning_rate': 2.214714714714715e-06, 'epoch': 0.07} 2%|▏ | 295/13317 [03:02<2:13:37, 1.62it/s] 2%|▏ | 296/13317 [03:02<2:13:25, 1.63it/s] {'loss': 0.5313, 'grad_norm': 0.8647900819778442, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.07} 2%|▏ | 296/13317 [03:02<2:13:25, 1.63it/s] 2%|▏ | 297/13317 [03:03<2:13:26, 1.63it/s] {'loss': 0.5055, 'grad_norm': 1.0204482078552246, 'learning_rate': 2.22972972972973e-06, 'epoch': 0.07} 2%|▏ | 297/13317 [03:03<2:13:26, 1.63it/s] 2%|▏ | 298/13317 [03:03<2:13:15, 1.63it/s] {'loss': 0.4443, 'grad_norm': 0.8799636363983154, 'learning_rate': 2.2372372372372376e-06, 'epoch': 0.07} 2%|▏ | 298/13317 [03:03<2:13:15, 1.63it/s] 2%|▏ | 299/13317 [03:04<2:13:11, 1.63it/s] {'loss': 0.5365, 'grad_norm': 0.8481038212776184, 'learning_rate': 2.244744744744745e-06, 'epoch': 0.07} 2%|▏ | 299/13317 [03:04<2:13:11, 1.63it/s] 2%|▏ | 300/13317 [03:05<2:13:21, 1.63it/s] {'loss': 0.377, 'grad_norm': 0.7336224913597107, 'learning_rate': 2.2522522522522524e-06, 'epoch': 0.07} 2%|▏ | 300/13317 [03:05<2:13:21, 1.63it/s] 2%|▏ | 301/13317 [03:05<2:13:19, 1.63it/s] {'loss': 0.5005, 'grad_norm': 0.7111504673957825, 'learning_rate': 2.25975975975976e-06, 'epoch': 0.07} 2%|▏ | 301/13317 [03:05<2:13:19, 1.63it/s] 2%|▏ | 302/13317 [03:06<2:13:11, 1.63it/s] {'loss': 0.4158, 'grad_norm': 0.7109354138374329, 'learning_rate': 2.2672672672672675e-06, 'epoch': 0.07} 2%|▏ | 302/13317 [03:06<2:13:11, 1.63it/s] 2%|▏ | 303/13317 [03:06<2:13:09, 1.63it/s] {'loss': 0.4644, 'grad_norm': 0.7365208268165588, 'learning_rate': 2.274774774774775e-06, 'epoch': 0.07} 2%|▏ | 303/13317 [03:06<2:13:09, 1.63it/s] 2%|▏ | 304/13317 [03:07<2:13:07, 1.63it/s] {'loss': 0.4992, 'grad_norm': 0.8309431672096252, 'learning_rate': 2.2822822822822822e-06, 'epoch': 0.07} 2%|▏ | 304/13317 [03:07<2:13:07, 1.63it/s] 2%|▏ | 305/13317 [03:08<2:13:17, 1.63it/s] {'loss': 0.5318, 'grad_norm': 0.852641761302948, 'learning_rate': 2.28978978978979e-06, 'epoch': 0.07} 2%|▏ | 305/13317 [03:08<2:13:17, 1.63it/s] 2%|▏ | 306/13317 [03:08<2:13:11, 1.63it/s] {'loss': 0.4677, 'grad_norm': 0.7747059464454651, 'learning_rate': 2.297297297297298e-06, 'epoch': 0.07} 2%|▏ | 306/13317 [03:08<2:13:11, 1.63it/s] 2%|▏ | 307/13317 [03:09<2:13:04, 1.63it/s] {'loss': 0.5994, 'grad_norm': 0.8614656329154968, 'learning_rate': 2.304804804804805e-06, 'epoch': 0.07} 2%|▏ | 307/13317 [03:09<2:13:04, 1.63it/s] 2%|▏ | 308/13317 [03:09<2:13:01, 1.63it/s] {'loss': 0.5473, 'grad_norm': 0.8134952187538147, 'learning_rate': 2.3123123123123125e-06, 'epoch': 0.07} 2%|▏ | 308/13317 [03:10<2:13:01, 1.63it/s] 2%|▏ | 309/13317 [03:10<2:13:02, 1.63it/s] {'loss': 0.65, 'grad_norm': 0.893435001373291, 'learning_rate': 2.31981981981982e-06, 'epoch': 0.07} 2%|▏ | 309/13317 [03:10<2:13:02, 1.63it/s] 2%|▏ | 310/13317 [03:11<2:13:15, 1.63it/s] {'loss': 0.4952, 'grad_norm': 0.8762337565422058, 'learning_rate': 2.3273273273273277e-06, 'epoch': 0.07} 2%|▏ | 310/13317 [03:11<2:13:15, 1.63it/s] 2%|▏ | 311/13317 [03:11<2:13:09, 1.63it/s] {'loss': 0.474, 'grad_norm': 0.7566710710525513, 'learning_rate': 2.3348348348348353e-06, 'epoch': 0.07} 2%|▏ | 311/13317 [03:11<2:13:09, 1.63it/s] 2%|▏ | 312/13317 [03:12<2:13:01, 1.63it/s] {'loss': 0.3861, 'grad_norm': 0.64943528175354, 'learning_rate': 2.3423423423423424e-06, 'epoch': 0.07} 2%|▏ | 312/13317 [03:12<2:13:01, 1.63it/s] 2%|▏ | 313/13317 [03:12<2:12:59, 1.63it/s] {'loss': 0.5086, 'grad_norm': 0.7462379932403564, 'learning_rate': 2.34984984984985e-06, 'epoch': 0.07} 2%|▏ | 313/13317 [03:13<2:12:59, 1.63it/s] 2%|▏ | 314/13317 [03:13<2:13:03, 1.63it/s] {'loss': 0.4086, 'grad_norm': 0.701115608215332, 'learning_rate': 2.3573573573573576e-06, 'epoch': 0.07} 2%|▏ | 314/13317 [03:13<2:13:03, 1.63it/s] 2%|▏ | 315/13317 [03:14<2:13:15, 1.63it/s] {'loss': 0.4928, 'grad_norm': 0.816949725151062, 'learning_rate': 2.364864864864865e-06, 'epoch': 0.07} 2%|▏ | 315/13317 [03:14<2:13:15, 1.63it/s] 2%|▏ | 316/13317 [03:14<2:13:11, 1.63it/s] {'loss': 0.4355, 'grad_norm': 0.8386324644088745, 'learning_rate': 2.3723723723723727e-06, 'epoch': 0.07} 2%|▏ | 316/13317 [03:14<2:13:11, 1.63it/s] 2%|▏ | 317/13317 [03:15<2:13:04, 1.63it/s] {'loss': 0.541, 'grad_norm': 0.7953378558158875, 'learning_rate': 2.37987987987988e-06, 'epoch': 0.07} 2%|▏ | 317/13317 [03:15<2:13:04, 1.63it/s] 2%|▏ | 318/13317 [03:16<2:13:01, 1.63it/s] {'loss': 0.4309, 'grad_norm': 0.7725192904472351, 'learning_rate': 2.3873873873873874e-06, 'epoch': 0.07} 2%|▏ | 318/13317 [03:16<2:13:01, 1.63it/s] 2%|▏ | 319/13317 [03:16<2:13:02, 1.63it/s] {'loss': 0.5655, 'grad_norm': 0.8880619406700134, 'learning_rate': 2.394894894894895e-06, 'epoch': 0.07} 2%|▏ | 319/13317 [03:16<2:13:02, 1.63it/s] 2%|▏ | 320/13317 [03:17<2:13:13, 1.63it/s] {'loss': 0.5018, 'grad_norm': 0.7653263211250305, 'learning_rate': 2.4024024024024026e-06, 'epoch': 0.07} 2%|▏ | 320/13317 [03:17<2:13:13, 1.63it/s] 2%|▏ | 321/13317 [03:17<2:13:06, 1.63it/s] {'loss': 0.535, 'grad_norm': 0.9908320903778076, 'learning_rate': 2.40990990990991e-06, 'epoch': 0.07} 2%|▏ | 321/13317 [03:18<2:13:06, 1.63it/s] 2%|▏ | 322/13317 [03:18<2:13:05, 1.63it/s] {'loss': 0.4911, 'grad_norm': 0.8123887181282043, 'learning_rate': 2.4174174174174177e-06, 'epoch': 0.07} 2%|▏ | 322/13317 [03:18<2:13:05, 1.63it/s] 2%|▏ | 323/13317 [03:19<2:13:00, 1.63it/s] {'loss': 0.4508, 'grad_norm': 0.7047619223594666, 'learning_rate': 2.4249249249249253e-06, 'epoch': 0.07} 2%|▏ | 323/13317 [03:19<2:13:00, 1.63it/s] 2%|▏ | 324/13317 [03:19<2:12:54, 1.63it/s] {'loss': 0.4737, 'grad_norm': 0.7331838607788086, 'learning_rate': 2.432432432432433e-06, 'epoch': 0.07} 2%|▏ | 324/13317 [03:19<2:12:54, 1.63it/s] 2%|▏ | 325/13317 [03:20<2:13:09, 1.63it/s] {'loss': 0.4795, 'grad_norm': 0.8097270131111145, 'learning_rate': 2.43993993993994e-06, 'epoch': 0.07} 2%|▏ | 325/13317 [03:20<2:13:09, 1.63it/s] 2%|▏ | 326/13317 [03:20<2:13:03, 1.63it/s] {'loss': 0.5617, 'grad_norm': 0.9416517019271851, 'learning_rate': 2.4474474474474476e-06, 'epoch': 0.07} 2%|▏ | 326/13317 [03:21<2:13:03, 1.63it/s] 2%|▏ | 327/13317 [03:21<2:12:55, 1.63it/s] {'loss': 0.4613, 'grad_norm': 0.7812457084655762, 'learning_rate': 2.454954954954955e-06, 'epoch': 0.07} 2%|▏ | 327/13317 [03:21<2:12:55, 1.63it/s] 2%|▏ | 328/13317 [03:22<2:12:50, 1.63it/s] {'loss': 0.4474, 'grad_norm': 0.7613444924354553, 'learning_rate': 2.4624624624624628e-06, 'epoch': 0.07} 2%|▏ | 328/13317 [03:22<2:12:50, 1.63it/s] 2%|▏ | 329/13317 [03:22<2:12:51, 1.63it/s] {'loss': 0.4647, 'grad_norm': 0.6596733927726746, 'learning_rate': 2.4699699699699703e-06, 'epoch': 0.07} 2%|▏ | 329/13317 [03:22<2:12:51, 1.63it/s] 2%|▏ | 330/13317 [03:23<2:12:59, 1.63it/s] {'loss': 0.492, 'grad_norm': 0.7416242957115173, 'learning_rate': 2.4774774774774775e-06, 'epoch': 0.07} 2%|▏ | 330/13317 [03:23<2:12:59, 1.63it/s] 2%|▏ | 331/13317 [03:24<2:12:55, 1.63it/s] {'loss': 0.4028, 'grad_norm': 0.6864984035491943, 'learning_rate': 2.484984984984985e-06, 'epoch': 0.07} 2%|▏ | 331/13317 [03:24<2:12:55, 1.63it/s] 2%|▏ | 332/13317 [03:24<2:19:49, 1.55it/s] {'loss': 0.5702, 'grad_norm': 0.8038687705993652, 'learning_rate': 2.4924924924924926e-06, 'epoch': 0.07} 2%|▏ | 332/13317 [03:24<2:19:49, 1.55it/s] 3%|▎ | 333/13317 [03:25<2:17:38, 1.57it/s] {'loss': 0.4988, 'grad_norm': 0.8439292311668396, 'learning_rate': 2.5e-06, 'epoch': 0.08} 3%|▎ | 333/13317 [03:25<2:17:38, 1.57it/s] 3%|▎ | 334/13317 [03:25<2:16:10, 1.59it/s] {'loss': 0.4337, 'grad_norm': 0.733538806438446, 'learning_rate': 2.5075075075075078e-06, 'epoch': 0.08} 3%|▎ | 334/13317 [03:26<2:16:10, 1.59it/s] 3%|▎ | 335/13317 [03:26<2:15:17, 1.60it/s] {'loss': 0.5706, 'grad_norm': 0.8551611304283142, 'learning_rate': 2.5150150150150154e-06, 'epoch': 0.08} 3%|▎ | 335/13317 [03:26<2:15:17, 1.60it/s] 3%|▎ | 336/13317 [03:27<2:14:28, 1.61it/s] {'loss': 0.5388, 'grad_norm': 0.9304776787757874, 'learning_rate': 2.5225225225225225e-06, 'epoch': 0.08} 3%|▎ | 336/13317 [03:27<2:14:28, 1.61it/s] 3%|▎ | 337/13317 [03:27<2:13:55, 1.62it/s] {'loss': 0.4888, 'grad_norm': 0.8561441898345947, 'learning_rate': 2.53003003003003e-06, 'epoch': 0.08} 3%|▎ | 337/13317 [03:27<2:13:55, 1.62it/s] 3%|▎ | 338/13317 [03:28<2:13:32, 1.62it/s] {'loss': 0.458, 'grad_norm': 0.7742632031440735, 'learning_rate': 2.5375375375375377e-06, 'epoch': 0.08} 3%|▎ | 338/13317 [03:28<2:13:32, 1.62it/s] 3%|▎ | 339/13317 [03:29<2:13:20, 1.62it/s] {'loss': 0.4706, 'grad_norm': 0.8861637115478516, 'learning_rate': 2.5450450450450452e-06, 'epoch': 0.08} 3%|▎ | 339/13317 [03:29<2:13:20, 1.62it/s] 3%|▎ | 340/13317 [03:29<2:13:05, 1.62it/s] {'loss': 0.4587, 'grad_norm': 0.9356284737586975, 'learning_rate': 2.552552552552553e-06, 'epoch': 0.08} 3%|▎ | 340/13317 [03:29<2:13:05, 1.62it/s] 3%|▎ | 341/13317 [03:30<2:13:01, 1.63it/s] {'loss': 0.4935, 'grad_norm': 0.7564237117767334, 'learning_rate': 2.56006006006006e-06, 'epoch': 0.08} 3%|▎ | 341/13317 [03:30<2:13:01, 1.63it/s] 3%|▎ | 342/13317 [03:30<2:12:53, 1.63it/s] {'loss': 0.6443, 'grad_norm': 0.7717772126197815, 'learning_rate': 2.5675675675675675e-06, 'epoch': 0.08} 3%|▎ | 342/13317 [03:31<2:12:53, 1.63it/s] 3%|▎ | 343/13317 [03:31<2:12:46, 1.63it/s] {'loss': 0.406, 'grad_norm': 0.6767715811729431, 'learning_rate': 2.575075075075075e-06, 'epoch': 0.08} 3%|▎ | 343/13317 [03:31<2:12:46, 1.63it/s] 3%|▎ | 344/13317 [03:32<2:13:07, 1.62it/s] {'loss': 0.5462, 'grad_norm': 0.7747556567192078, 'learning_rate': 2.5825825825825827e-06, 'epoch': 0.08} 3%|▎ | 344/13317 [03:32<2:13:07, 1.62it/s] 3%|▎ | 345/13317 [03:32<2:13:00, 1.63it/s] {'loss': 0.3798, 'grad_norm': 0.6473003625869751, 'learning_rate': 2.5900900900900907e-06, 'epoch': 0.08} 3%|▎ | 345/13317 [03:32<2:13:00, 1.63it/s] 3%|▎ | 346/13317 [03:33<2:12:58, 1.63it/s] {'loss': 0.5846, 'grad_norm': 0.8775733709335327, 'learning_rate': 2.5975975975975974e-06, 'epoch': 0.08} 3%|▎ | 346/13317 [03:33<2:12:58, 1.63it/s] 3%|▎ | 347/13317 [03:33<2:12:50, 1.63it/s] {'loss': 0.4178, 'grad_norm': 0.7576457262039185, 'learning_rate': 2.605105105105105e-06, 'epoch': 0.08} 3%|▎ | 347/13317 [03:34<2:12:50, 1.63it/s] 3%|▎ | 348/13317 [03:34<2:12:48, 1.63it/s] {'loss': 0.5195, 'grad_norm': 0.7039090394973755, 'learning_rate': 2.612612612612613e-06, 'epoch': 0.08} 3%|▎ | 348/13317 [03:34<2:12:48, 1.63it/s] 3%|▎ | 349/13317 [03:35<2:12:47, 1.63it/s] {'loss': 0.448, 'grad_norm': 0.7303341031074524, 'learning_rate': 2.6201201201201205e-06, 'epoch': 0.08} 3%|▎ | 349/13317 [03:35<2:12:47, 1.63it/s] 3%|▎ | 350/13317 [03:35<2:12:40, 1.63it/s] {'loss': 0.4704, 'grad_norm': 0.7013117074966431, 'learning_rate': 2.627627627627628e-06, 'epoch': 0.08} 3%|▎ | 350/13317 [03:35<2:12:40, 1.63it/s] 3%|▎ | 351/13317 [03:36<2:12:43, 1.63it/s] {'loss': 0.5373, 'grad_norm': 0.7986212968826294, 'learning_rate': 2.6351351351351353e-06, 'epoch': 0.08} 3%|▎ | 351/13317 [03:36<2:12:43, 1.63it/s] 3%|▎ | 352/13317 [03:37<2:12:40, 1.63it/s] {'loss': 0.5989, 'grad_norm': 0.8149718046188354, 'learning_rate': 2.642642642642643e-06, 'epoch': 0.08} 3%|▎ | 352/13317 [03:37<2:12:40, 1.63it/s] 3%|▎ | 353/13317 [03:37<2:12:48, 1.63it/s] {'loss': 0.5062, 'grad_norm': 0.7915011048316956, 'learning_rate': 2.6501501501501504e-06, 'epoch': 0.08} 3%|▎ | 353/13317 [03:37<2:12:48, 1.63it/s] 3%|▎ | 354/13317 [03:38<2:12:44, 1.63it/s] {'loss': 0.5307, 'grad_norm': 0.7167931795120239, 'learning_rate': 2.657657657657658e-06, 'epoch': 0.08} 3%|▎ | 354/13317 [03:38<2:12:44, 1.63it/s] 3%|▎ | 355/13317 [03:38<2:12:36, 1.63it/s] {'loss': 0.6471, 'grad_norm': 0.8861395716667175, 'learning_rate': 2.6651651651651656e-06, 'epoch': 0.08} 3%|▎ | 355/13317 [03:39<2:12:36, 1.63it/s] 3%|▎ | 356/13317 [03:39<2:12:36, 1.63it/s] {'loss': 0.4904, 'grad_norm': 0.767351508140564, 'learning_rate': 2.672672672672673e-06, 'epoch': 0.08} 3%|▎ | 356/13317 [03:39<2:12:36, 1.63it/s] 3%|▎ | 357/13317 [03:40<2:12:33, 1.63it/s] {'loss': 0.4671, 'grad_norm': 0.8061123490333557, 'learning_rate': 2.6801801801801803e-06, 'epoch': 0.08} 3%|▎ | 357/13317 [03:40<2:12:33, 1.63it/s] 3%|▎ | 358/13317 [03:40<2:12:39, 1.63it/s] {'loss': 0.4055, 'grad_norm': 0.6337212324142456, 'learning_rate': 2.687687687687688e-06, 'epoch': 0.08} 3%|▎ | 358/13317 [03:40<2:12:39, 1.63it/s] 3%|▎ | 359/13317 [03:41<2:12:37, 1.63it/s] {'loss': 0.4451, 'grad_norm': 0.8043761849403381, 'learning_rate': 2.6951951951951954e-06, 'epoch': 0.08} 3%|▎ | 359/13317 [03:41<2:12:37, 1.63it/s] 3%|▎ | 360/13317 [03:41<2:12:41, 1.63it/s] {'loss': 0.3945, 'grad_norm': 0.628926157951355, 'learning_rate': 2.702702702702703e-06, 'epoch': 0.08} 3%|▎ | 360/13317 [03:42<2:12:41, 1.63it/s] 3%|▎ | 361/13317 [03:42<2:20:11, 1.54it/s] {'loss': 0.515, 'grad_norm': 0.8958503007888794, 'learning_rate': 2.7102102102102106e-06, 'epoch': 0.08} 3%|▎ | 361/13317 [03:42<2:20:11, 1.54it/s] 3%|▎ | 362/13317 [03:43<2:17:53, 1.57it/s] {'loss': 0.4603, 'grad_norm': 0.7154403328895569, 'learning_rate': 2.7177177177177177e-06, 'epoch': 0.08} 3%|▎ | 362/13317 [03:43<2:17:53, 1.57it/s] 3%|▎ | 363/13317 [03:43<2:16:28, 1.58it/s] {'loss': 0.5884, 'grad_norm': 0.8347815871238708, 'learning_rate': 2.7252252252252253e-06, 'epoch': 0.08} 3%|▎ | 363/13317 [03:44<2:16:28, 1.58it/s] 3%|▎ | 364/13317 [03:44<2:15:14, 1.60it/s] {'loss': 0.4976, 'grad_norm': 0.8042851686477661, 'learning_rate': 2.732732732732733e-06, 'epoch': 0.08} 3%|▎ | 364/13317 [03:44<2:15:14, 1.60it/s] 3%|▎ | 365/13317 [03:45<2:14:24, 1.61it/s] {'loss': 0.421, 'grad_norm': 0.6979194283485413, 'learning_rate': 2.7402402402402405e-06, 'epoch': 0.08} 3%|▎ | 365/13317 [03:45<2:14:24, 1.61it/s] 3%|▎ | 366/13317 [03:45<2:13:46, 1.61it/s] {'loss': 0.4169, 'grad_norm': 0.6536282896995544, 'learning_rate': 2.747747747747748e-06, 'epoch': 0.08} 3%|▎ | 366/13317 [03:45<2:13:46, 1.61it/s] 3%|▎ | 367/13317 [03:46<2:13:20, 1.62it/s] {'loss': 0.4715, 'grad_norm': 0.6974200010299683, 'learning_rate': 2.755255255255255e-06, 'epoch': 0.08} 3%|▎ | 367/13317 [03:46<2:13:20, 1.62it/s] 3%|▎ | 368/13317 [03:46<2:13:03, 1.62it/s] {'loss': 0.4677, 'grad_norm': 0.6706352233886719, 'learning_rate': 2.7627627627627628e-06, 'epoch': 0.08} 3%|▎ | 368/13317 [03:47<2:13:03, 1.62it/s] 3%|▎ | 369/13317 [03:47<2:12:47, 1.63it/s] {'loss': 0.3203, 'grad_norm': 0.6404407620429993, 'learning_rate': 2.7702702702702703e-06, 'epoch': 0.08} 3%|▎ | 369/13317 [03:47<2:12:47, 1.63it/s] 3%|▎ | 370/13317 [03:48<2:12:37, 1.63it/s] {'loss': 0.3854, 'grad_norm': 0.712901771068573, 'learning_rate': 2.7777777777777783e-06, 'epoch': 0.08} 3%|▎ | 370/13317 [03:48<2:12:37, 1.63it/s] 3%|▎ | 371/13317 [03:48<2:12:39, 1.63it/s] {'loss': 0.4965, 'grad_norm': 0.820353090763092, 'learning_rate': 2.785285285285286e-06, 'epoch': 0.08} 3%|▎ | 371/13317 [03:48<2:12:39, 1.63it/s] 3%|▎ | 372/13317 [03:49<2:12:36, 1.63it/s] {'loss': 0.5023, 'grad_norm': 0.7078438997268677, 'learning_rate': 2.7927927927927926e-06, 'epoch': 0.08} 3%|▎ | 372/13317 [03:49<2:12:36, 1.63it/s] 3%|▎ | 373/13317 [03:50<2:12:30, 1.63it/s] {'loss': 0.3651, 'grad_norm': 0.7108337879180908, 'learning_rate': 2.8003003003003006e-06, 'epoch': 0.08} 3%|▎ | 373/13317 [03:50<2:12:30, 1.63it/s] 3%|▎ | 374/13317 [03:50<2:12:29, 1.63it/s] {'loss': 0.4168, 'grad_norm': 0.6355459094047546, 'learning_rate': 2.807807807807808e-06, 'epoch': 0.08} 3%|▎ | 374/13317 [03:50<2:12:29, 1.63it/s] 3%|▎ | 375/13317 [03:51<2:12:27, 1.63it/s] {'loss': 0.4343, 'grad_norm': 0.7654605507850647, 'learning_rate': 2.8153153153153158e-06, 'epoch': 0.08} 3%|▎ | 375/13317 [03:51<2:12:27, 1.63it/s] 3%|▎ | 376/13317 [03:51<2:12:33, 1.63it/s] {'loss': 0.4584, 'grad_norm': 0.7532657384872437, 'learning_rate': 2.8228228228228234e-06, 'epoch': 0.08} 3%|▎ | 376/13317 [03:52<2:12:33, 1.63it/s] 3%|▎ | 377/13317 [03:52<2:12:23, 1.63it/s] {'loss': 0.4836, 'grad_norm': 0.7189052104949951, 'learning_rate': 2.8303303303303305e-06, 'epoch': 0.08} 3%|▎ | 377/13317 [03:52<2:12:23, 1.63it/s] 3%|▎ | 378/13317 [03:53<2:12:24, 1.63it/s] {'loss': 0.4595, 'grad_norm': 0.8073223829269409, 'learning_rate': 2.837837837837838e-06, 'epoch': 0.09} 3%|▎ | 378/13317 [03:53<2:12:24, 1.63it/s] 3%|▎ | 379/13317 [03:53<2:12:19, 1.63it/s] {'loss': 0.4275, 'grad_norm': 0.742399275302887, 'learning_rate': 2.8453453453453457e-06, 'epoch': 0.09} 3%|▎ | 379/13317 [03:53<2:12:19, 1.63it/s] 3%|▎ | 380/13317 [03:54<2:12:21, 1.63it/s] {'loss': 0.4115, 'grad_norm': 0.9317102432250977, 'learning_rate': 2.8528528528528532e-06, 'epoch': 0.09} 3%|▎ | 380/13317 [03:54<2:12:21, 1.63it/s] 3%|▎ | 381/13317 [03:54<2:12:25, 1.63it/s] {'loss': 0.4723, 'grad_norm': 0.8024287819862366, 'learning_rate': 2.860360360360361e-06, 'epoch': 0.09} 3%|▎ | 381/13317 [03:55<2:12:25, 1.63it/s] 3%|▎ | 382/13317 [03:55<2:12:23, 1.63it/s] {'loss': 0.4736, 'grad_norm': 0.6790726780891418, 'learning_rate': 2.867867867867868e-06, 'epoch': 0.09} 3%|▎ | 382/13317 [03:55<2:12:23, 1.63it/s] 3%|▎ | 383/13317 [03:56<2:12:15, 1.63it/s] {'loss': 0.4309, 'grad_norm': 0.8539626598358154, 'learning_rate': 2.8753753753753755e-06, 'epoch': 0.09} 3%|▎ | 383/13317 [03:56<2:12:15, 1.63it/s] 3%|▎ | 384/13317 [03:56<2:12:19, 1.63it/s] {'loss': 0.3837, 'grad_norm': 0.7407821416854858, 'learning_rate': 2.882882882882883e-06, 'epoch': 0.09} 3%|▎ | 384/13317 [03:56<2:12:19, 1.63it/s] 3%|▎ | 385/13317 [03:57<2:12:15, 1.63it/s] {'loss': 0.4586, 'grad_norm': 0.8013657331466675, 'learning_rate': 2.8903903903903907e-06, 'epoch': 0.09} 3%|▎ | 385/13317 [03:57<2:12:15, 1.63it/s] 3%|▎ | 386/13317 [03:58<2:12:18, 1.63it/s] {'loss': 0.4814, 'grad_norm': 0.6932796239852905, 'learning_rate': 2.8978978978978983e-06, 'epoch': 0.09} 3%|▎ | 386/13317 [03:58<2:12:18, 1.63it/s] 3%|▎ | 387/13317 [03:58<2:12:16, 1.63it/s] {'loss': 0.345, 'grad_norm': 0.7515282034873962, 'learning_rate': 2.9054054054054054e-06, 'epoch': 0.09} 3%|▎ | 387/13317 [03:58<2:12:16, 1.63it/s] 3%|▎ | 388/13317 [03:59<2:12:15, 1.63it/s] {'loss': 0.4296, 'grad_norm': 0.7212568521499634, 'learning_rate': 2.912912912912913e-06, 'epoch': 0.09} 3%|▎ | 388/13317 [03:59<2:12:15, 1.63it/s] 3%|▎ | 389/13317 [03:59<2:12:11, 1.63it/s] {'loss': 0.4345, 'grad_norm': 0.6890455484390259, 'learning_rate': 2.9204204204204206e-06, 'epoch': 0.09} 3%|▎ | 389/13317 [04:00<2:12:11, 1.63it/s] 3%|▎ | 390/13317 [04:00<2:12:11, 1.63it/s] {'loss': 0.3573, 'grad_norm': 0.6826176047325134, 'learning_rate': 2.927927927927928e-06, 'epoch': 0.09} 3%|▎ | 390/13317 [04:00<2:12:11, 1.63it/s] 3%|▎ | 391/13317 [04:01<2:12:18, 1.63it/s] {'loss': 0.4455, 'grad_norm': 0.7777101993560791, 'learning_rate': 2.9354354354354357e-06, 'epoch': 0.09} 3%|▎ | 391/13317 [04:01<2:12:18, 1.63it/s] 3%|▎ | 392/13317 [04:01<2:12:13, 1.63it/s] {'loss': 0.4204, 'grad_norm': 0.6666024923324585, 'learning_rate': 2.942942942942943e-06, 'epoch': 0.09} 3%|▎ | 392/13317 [04:01<2:12:13, 1.63it/s] 3%|▎ | 393/13317 [04:02<2:12:13, 1.63it/s] {'loss': 0.455, 'grad_norm': 0.7357118725776672, 'learning_rate': 2.9504504504504504e-06, 'epoch': 0.09} 3%|▎ | 393/13317 [04:02<2:12:13, 1.63it/s] 3%|▎ | 394/13317 [04:02<2:12:08, 1.63it/s] {'loss': 0.3786, 'grad_norm': 0.6265907883644104, 'learning_rate': 2.957957957957958e-06, 'epoch': 0.09} 3%|▎ | 394/13317 [04:03<2:12:08, 1.63it/s] 3%|▎ | 395/13317 [04:03<2:12:03, 1.63it/s] {'loss': 0.3602, 'grad_norm': 0.6242502927780151, 'learning_rate': 2.9654654654654656e-06, 'epoch': 0.09} 3%|▎ | 395/13317 [04:03<2:12:03, 1.63it/s] 3%|▎ | 396/13317 [04:04<2:12:17, 1.63it/s] {'loss': 0.4138, 'grad_norm': 0.7104613184928894, 'learning_rate': 2.9729729729729736e-06, 'epoch': 0.09} 3%|▎ | 396/13317 [04:04<2:12:17, 1.63it/s] 3%|▎ | 397/13317 [04:04<2:12:11, 1.63it/s] {'loss': 0.4007, 'grad_norm': 0.688345730304718, 'learning_rate': 2.980480480480481e-06, 'epoch': 0.09} 3%|▎ | 397/13317 [04:04<2:12:11, 1.63it/s] 3%|▎ | 398/13317 [04:05<2:12:08, 1.63it/s] {'loss': 0.4401, 'grad_norm': 0.62779700756073, 'learning_rate': 2.987987987987988e-06, 'epoch': 0.09} 3%|▎ | 398/13317 [04:05<2:12:08, 1.63it/s] 3%|▎ | 399/13317 [04:06<2:12:04, 1.63it/s] {'loss': 0.4056, 'grad_norm': 0.7132279276847839, 'learning_rate': 2.995495495495496e-06, 'epoch': 0.09} 3%|▎ | 399/13317 [04:06<2:12:04, 1.63it/s] 3%|▎ | 400/13317 [04:06<2:12:03, 1.63it/s] {'loss': 0.4577, 'grad_norm': 0.6780898571014404, 'learning_rate': 3.0030030030030034e-06, 'epoch': 0.09} 3%|▎ | 400/13317 [04:06<2:12:03, 1.63it/s] 3%|▎ | 401/13317 [04:07<2:12:24, 1.63it/s] {'loss': 0.4987, 'grad_norm': 0.7893285751342773, 'learning_rate': 3.010510510510511e-06, 'epoch': 0.09} 3%|▎ | 401/13317 [04:07<2:12:24, 1.63it/s] 3%|▎ | 402/13317 [04:07<2:12:23, 1.63it/s] {'loss': 0.4325, 'grad_norm': 0.6596247553825378, 'learning_rate': 3.0180180180180186e-06, 'epoch': 0.09} 3%|▎ | 402/13317 [04:07<2:12:23, 1.63it/s] 3%|▎ | 403/13317 [04:08<2:12:20, 1.63it/s] {'loss': 0.3937, 'grad_norm': 0.7547261714935303, 'learning_rate': 3.0255255255255257e-06, 'epoch': 0.09} 3%|▎ | 403/13317 [04:08<2:12:20, 1.63it/s] 3%|▎ | 404/13317 [04:09<2:12:15, 1.63it/s] {'loss': 0.5235, 'grad_norm': 0.7699780464172363, 'learning_rate': 3.0330330330330333e-06, 'epoch': 0.09} 3%|▎ | 404/13317 [04:09<2:12:15, 1.63it/s] 3%|▎ | 405/13317 [04:09<2:12:09, 1.63it/s] {'loss': 0.5291, 'grad_norm': 0.7426941394805908, 'learning_rate': 3.040540540540541e-06, 'epoch': 0.09} 3%|▎ | 405/13317 [04:09<2:12:09, 1.63it/s] 3%|▎ | 406/13317 [04:10<2:12:09, 1.63it/s] {'loss': 0.4079, 'grad_norm': 0.7774872183799744, 'learning_rate': 3.0480480480480485e-06, 'epoch': 0.09} 3%|▎ | 406/13317 [04:10<2:12:09, 1.63it/s] 3%|▎ | 407/13317 [04:10<2:12:05, 1.63it/s] {'loss': 0.4196, 'grad_norm': 0.767114520072937, 'learning_rate': 3.055555555555556e-06, 'epoch': 0.09} 3%|▎ | 407/13317 [04:11<2:12:05, 1.63it/s] 3%|▎ | 408/13317 [04:11<2:12:43, 1.62it/s] {'loss': 0.4518, 'grad_norm': 0.7248908877372742, 'learning_rate': 3.063063063063063e-06, 'epoch': 0.09} 3%|▎ | 408/13317 [04:11<2:12:43, 1.62it/s] 3%|▎ | 409/13317 [04:12<2:12:31, 1.62it/s] {'loss': 0.4427, 'grad_norm': 0.7351310849189758, 'learning_rate': 3.0705705705705708e-06, 'epoch': 0.09} 3%|▎ | 409/13317 [04:12<2:12:31, 1.62it/s] 3%|▎ | 410/13317 [04:12<2:12:20, 1.63it/s] {'loss': 0.4752, 'grad_norm': 0.7025326490402222, 'learning_rate': 3.0780780780780783e-06, 'epoch': 0.09} 3%|▎ | 410/13317 [04:12<2:12:20, 1.63it/s] 3%|▎ | 411/13317 [04:13<2:12:17, 1.63it/s] {'loss': 0.4543, 'grad_norm': 0.6488361358642578, 'learning_rate': 3.085585585585586e-06, 'epoch': 0.09} 3%|▎ | 411/13317 [04:13<2:12:17, 1.63it/s] 3%|▎ | 412/13317 [04:14<2:12:12, 1.63it/s] {'loss': 0.4685, 'grad_norm': 0.6951853632926941, 'learning_rate': 3.0930930930930935e-06, 'epoch': 0.09} 3%|▎ | 412/13317 [04:14<2:12:12, 1.63it/s] 3%|▎ | 413/13317 [04:14<2:12:15, 1.63it/s] {'loss': 0.4958, 'grad_norm': 0.8334890604019165, 'learning_rate': 3.1006006006006006e-06, 'epoch': 0.09} 3%|▎ | 413/13317 [04:14<2:12:15, 1.63it/s] 3%|▎ | 414/13317 [04:15<2:12:06, 1.63it/s] {'loss': 0.3908, 'grad_norm': 0.6970667243003845, 'learning_rate': 3.1081081081081082e-06, 'epoch': 0.09} 3%|▎ | 414/13317 [04:15<2:12:06, 1.63it/s] 3%|▎ | 415/13317 [04:15<2:12:01, 1.63it/s] {'loss': 0.5229, 'grad_norm': 0.7065516710281372, 'learning_rate': 3.1156156156156158e-06, 'epoch': 0.09} 3%|▎ | 415/13317 [04:15<2:12:01, 1.63it/s] 3%|▎ | 416/13317 [04:16<2:12:05, 1.63it/s] {'loss': 0.455, 'grad_norm': 0.7638353705406189, 'learning_rate': 3.1231231231231234e-06, 'epoch': 0.09} 3%|▎ | 416/13317 [04:16<2:12:05, 1.63it/s] 3%|▎ | 417/13317 [04:17<2:12:03, 1.63it/s] {'loss': 0.3615, 'grad_norm': 0.6620998978614807, 'learning_rate': 3.130630630630631e-06, 'epoch': 0.09} 3%|▎ | 417/13317 [04:17<2:12:03, 1.63it/s] 3%|▎ | 418/13317 [04:17<2:12:03, 1.63it/s] {'loss': 0.5016, 'grad_norm': 0.7566660642623901, 'learning_rate': 3.138138138138138e-06, 'epoch': 0.09} 3%|▎ | 418/13317 [04:17<2:12:03, 1.63it/s] 3%|▎ | 419/13317 [04:18<2:11:57, 1.63it/s] {'loss': 0.3935, 'grad_norm': 0.672184407711029, 'learning_rate': 3.1456456456456457e-06, 'epoch': 0.09} 3%|▎ | 419/13317 [04:18<2:11:57, 1.63it/s] 3%|▎ | 420/13317 [04:18<2:11:51, 1.63it/s] {'loss': 0.5348, 'grad_norm': 0.797938346862793, 'learning_rate': 3.1531531531531532e-06, 'epoch': 0.09} 3%|▎ | 420/13317 [04:19<2:11:51, 1.63it/s] 3%|▎ | 421/13317 [04:19<2:12:04, 1.63it/s] {'loss': 0.5303, 'grad_norm': 0.8910072445869446, 'learning_rate': 3.160660660660661e-06, 'epoch': 0.09} 3%|▎ | 421/13317 [04:19<2:12:04, 1.63it/s] 3%|▎ | 422/13317 [04:20<2:11:55, 1.63it/s] {'loss': 0.4015, 'grad_norm': 0.6455042362213135, 'learning_rate': 3.168168168168169e-06, 'epoch': 0.1} 3%|▎ | 422/13317 [04:20<2:11:55, 1.63it/s] 3%|▎ | 423/13317 [04:20<2:11:52, 1.63it/s] {'loss': 0.5236, 'grad_norm': 0.6573291420936584, 'learning_rate': 3.1756756756756755e-06, 'epoch': 0.1} 3%|▎ | 423/13317 [04:20<2:11:52, 1.63it/s] 3%|▎ | 424/13317 [04:21<2:11:49, 1.63it/s] {'loss': 0.4813, 'grad_norm': 0.7061600089073181, 'learning_rate': 3.183183183183183e-06, 'epoch': 0.1} 3%|▎ | 424/13317 [04:21<2:11:49, 1.63it/s] 3%|▎ | 425/13317 [04:21<2:11:48, 1.63it/s] {'loss': 0.4781, 'grad_norm': 0.706832230091095, 'learning_rate': 3.190690690690691e-06, 'epoch': 0.1} 3%|▎ | 425/13317 [04:22<2:11:48, 1.63it/s] 3%|▎ | 426/13317 [04:22<2:11:54, 1.63it/s] {'loss': 0.4983, 'grad_norm': 0.7267263531684875, 'learning_rate': 3.1981981981981987e-06, 'epoch': 0.1} 3%|▎ | 426/13317 [04:22<2:11:54, 1.63it/s] 3%|▎ | 427/13317 [04:23<2:11:51, 1.63it/s] {'loss': 0.4213, 'grad_norm': 0.6855952739715576, 'learning_rate': 3.2057057057057063e-06, 'epoch': 0.1} 3%|▎ | 427/13317 [04:23<2:11:51, 1.63it/s] 3%|▎ | 428/13317 [04:23<2:11:52, 1.63it/s] {'loss': 0.4775, 'grad_norm': 0.6827059388160706, 'learning_rate': 3.2132132132132134e-06, 'epoch': 0.1} 3%|▎ | 428/13317 [04:23<2:11:52, 1.63it/s] 3%|▎ | 429/13317 [04:24<2:11:49, 1.63it/s] {'loss': 0.4742, 'grad_norm': 0.6486400365829468, 'learning_rate': 3.220720720720721e-06, 'epoch': 0.1} 3%|▎ | 429/13317 [04:24<2:11:49, 1.63it/s] 3%|▎ | 430/13317 [04:25<2:11:46, 1.63it/s] {'loss': 0.458, 'grad_norm': 0.7116115689277649, 'learning_rate': 3.2282282282282286e-06, 'epoch': 0.1} 3%|▎ | 430/13317 [04:25<2:11:46, 1.63it/s] 3%|▎ | 431/13317 [04:25<2:11:57, 1.63it/s] {'loss': 0.5099, 'grad_norm': 0.6915836334228516, 'learning_rate': 3.235735735735736e-06, 'epoch': 0.1} 3%|▎ | 431/13317 [04:25<2:11:57, 1.63it/s] 3%|▎ | 432/13317 [04:26<2:11:51, 1.63it/s] {'loss': 0.459, 'grad_norm': 0.7435068488121033, 'learning_rate': 3.2432432432432437e-06, 'epoch': 0.1} 3%|▎ | 432/13317 [04:26<2:11:51, 1.63it/s] 3%|▎ | 433/13317 [04:26<2:11:49, 1.63it/s] {'loss': 0.438, 'grad_norm': 0.6927868127822876, 'learning_rate': 3.250750750750751e-06, 'epoch': 0.1} 3%|▎ | 433/13317 [04:27<2:11:49, 1.63it/s] 3%|▎ | 434/13317 [04:27<2:11:51, 1.63it/s] {'loss': 0.408, 'grad_norm': 0.7656463980674744, 'learning_rate': 3.2582582582582584e-06, 'epoch': 0.1} 3%|▎ | 434/13317 [04:27<2:11:51, 1.63it/s] 3%|▎ | 435/13317 [04:28<2:11:45, 1.63it/s] {'loss': 0.4215, 'grad_norm': 0.6323496103286743, 'learning_rate': 3.265765765765766e-06, 'epoch': 0.1} 3%|▎ | 435/13317 [04:28<2:11:45, 1.63it/s] 3%|▎ | 436/13317 [04:28<2:11:53, 1.63it/s] {'loss': 0.4551, 'grad_norm': 0.7722992300987244, 'learning_rate': 3.2732732732732736e-06, 'epoch': 0.1} 3%|▎ | 436/13317 [04:28<2:11:53, 1.63it/s] 3%|▎ | 437/13317 [04:29<2:11:49, 1.63it/s] {'loss': 0.4665, 'grad_norm': 0.8740445375442505, 'learning_rate': 3.280780780780781e-06, 'epoch': 0.1} 3%|▎ | 437/13317 [04:29<2:11:49, 1.63it/s] 3%|▎ | 438/13317 [04:29<2:11:44, 1.63it/s] {'loss': 0.3806, 'grad_norm': 0.579095184803009, 'learning_rate': 3.2882882882882887e-06, 'epoch': 0.1} 3%|▎ | 438/13317 [04:30<2:11:44, 1.63it/s] 3%|▎ | 439/13317 [04:30<2:11:45, 1.63it/s] {'loss': 0.4427, 'grad_norm': 0.6989012360572815, 'learning_rate': 3.295795795795796e-06, 'epoch': 0.1} 3%|▎ | 439/13317 [04:30<2:11:45, 1.63it/s] 3%|▎ | 440/13317 [04:31<2:11:40, 1.63it/s] {'loss': 0.4532, 'grad_norm': 0.6261010766029358, 'learning_rate': 3.3033033033033035e-06, 'epoch': 0.1} 3%|▎ | 440/13317 [04:31<2:11:40, 1.63it/s] 3%|▎ | 441/13317 [04:31<2:11:46, 1.63it/s] {'loss': 0.4531, 'grad_norm': 0.8126707673072815, 'learning_rate': 3.310810810810811e-06, 'epoch': 0.1} 3%|▎ | 441/13317 [04:31<2:11:46, 1.63it/s] 3%|▎ | 442/13317 [04:32<2:11:39, 1.63it/s] {'loss': 0.4213, 'grad_norm': 0.762205183506012, 'learning_rate': 3.3183183183183186e-06, 'epoch': 0.1} 3%|▎ | 442/13317 [04:32<2:11:39, 1.63it/s] 3%|▎ | 443/13317 [04:33<2:11:40, 1.63it/s] {'loss': 0.4377, 'grad_norm': 0.6316590905189514, 'learning_rate': 3.325825825825826e-06, 'epoch': 0.1} 3%|▎ | 443/13317 [04:33<2:11:40, 1.63it/s] 3%|▎ | 444/13317 [04:33<2:11:43, 1.63it/s] {'loss': 0.4483, 'grad_norm': 0.7545338869094849, 'learning_rate': 3.3333333333333333e-06, 'epoch': 0.1} 3%|▎ | 444/13317 [04:33<2:11:43, 1.63it/s] 3%|▎ | 445/13317 [04:34<2:11:38, 1.63it/s] {'loss': 0.4915, 'grad_norm': 0.7908391356468201, 'learning_rate': 3.340840840840841e-06, 'epoch': 0.1} 3%|▎ | 445/13317 [04:34<2:11:38, 1.63it/s] 3%|▎ | 446/13317 [04:34<2:11:55, 1.63it/s] {'loss': 0.6086, 'grad_norm': 0.7524520754814148, 'learning_rate': 3.3483483483483485e-06, 'epoch': 0.1} 3%|▎ | 446/13317 [04:35<2:11:55, 1.63it/s] 3%|▎ | 447/13317 [04:35<2:11:46, 1.63it/s] {'loss': 0.4132, 'grad_norm': 0.6261481046676636, 'learning_rate': 3.3558558558558565e-06, 'epoch': 0.1} 3%|▎ | 447/13317 [04:35<2:11:46, 1.63it/s] 3%|▎ | 448/13317 [04:36<2:11:41, 1.63it/s] {'loss': 0.4198, 'grad_norm': 0.7558276653289795, 'learning_rate': 3.363363363363364e-06, 'epoch': 0.1} 3%|▎ | 448/13317 [04:36<2:11:41, 1.63it/s] 3%|▎ | 449/13317 [04:36<2:11:43, 1.63it/s] {'loss': 0.421, 'grad_norm': 0.6811279654502869, 'learning_rate': 3.3708708708708708e-06, 'epoch': 0.1} 3%|▎ | 449/13317 [04:36<2:11:43, 1.63it/s] 3%|▎ | 450/13317 [04:37<2:11:40, 1.63it/s] {'loss': 0.4403, 'grad_norm': 0.704143226146698, 'learning_rate': 3.3783783783783788e-06, 'epoch': 0.1} 3%|▎ | 450/13317 [04:37<2:11:40, 1.63it/s] 3%|▎ | 451/13317 [04:37<2:11:48, 1.63it/s] {'loss': 0.3303, 'grad_norm': 0.6244333982467651, 'learning_rate': 3.3858858858858863e-06, 'epoch': 0.1} 3%|▎ | 451/13317 [04:38<2:11:48, 1.63it/s] 3%|▎ | 452/13317 [04:38<2:11:42, 1.63it/s] {'loss': 0.5078, 'grad_norm': 0.6552481055259705, 'learning_rate': 3.393393393393394e-06, 'epoch': 0.1} 3%|▎ | 452/13317 [04:38<2:11:42, 1.63it/s] 3%|▎ | 453/13317 [04:39<2:11:37, 1.63it/s] {'loss': 0.4777, 'grad_norm': 0.723028302192688, 'learning_rate': 3.4009009009009015e-06, 'epoch': 0.1} 3%|▎ | 453/13317 [04:39<2:11:37, 1.63it/s] 3%|▎ | 454/13317 [04:39<2:11:34, 1.63it/s] {'loss': 0.4813, 'grad_norm': 0.6408616304397583, 'learning_rate': 3.4084084084084086e-06, 'epoch': 0.1} 3%|▎ | 454/13317 [04:39<2:11:34, 1.63it/s] 3%|▎ | 455/13317 [04:40<2:11:31, 1.63it/s] {'loss': 0.4815, 'grad_norm': 0.7385659217834473, 'learning_rate': 3.4159159159159162e-06, 'epoch': 0.1} 3%|▎ | 455/13317 [04:40<2:11:31, 1.63it/s] 3%|▎ | 456/13317 [04:41<2:11:38, 1.63it/s] {'loss': 0.4326, 'grad_norm': 0.6278891563415527, 'learning_rate': 3.423423423423424e-06, 'epoch': 0.1} 3%|▎ | 456/13317 [04:41<2:11:38, 1.63it/s] 3%|▎ | 457/13317 [04:41<2:11:35, 1.63it/s] {'loss': 0.3893, 'grad_norm': 0.5875856280326843, 'learning_rate': 3.4309309309309314e-06, 'epoch': 0.1} 3%|▎ | 457/13317 [04:41<2:11:35, 1.63it/s] 3%|▎ | 458/13317 [04:42<2:11:36, 1.63it/s] {'loss': 0.3643, 'grad_norm': 0.67644864320755, 'learning_rate': 3.438438438438439e-06, 'epoch': 0.1} 3%|▎ | 458/13317 [04:42<2:11:36, 1.63it/s] 3%|▎ | 459/13317 [04:42<2:11:33, 1.63it/s] {'loss': 0.3107, 'grad_norm': 0.5073651671409607, 'learning_rate': 3.445945945945946e-06, 'epoch': 0.1} 3%|▎ | 459/13317 [04:43<2:11:33, 1.63it/s] 3%|▎ | 460/13317 [04:43<2:11:32, 1.63it/s] {'loss': 0.541, 'grad_norm': 0.81984943151474, 'learning_rate': 3.4534534534534537e-06, 'epoch': 0.1} 3%|▎ | 460/13317 [04:43<2:11:32, 1.63it/s] 3%|▎ | 461/13317 [04:44<2:11:43, 1.63it/s] {'loss': 0.5197, 'grad_norm': 0.8332152366638184, 'learning_rate': 3.4609609609609612e-06, 'epoch': 0.1} 3%|▎ | 461/13317 [04:44<2:11:43, 1.63it/s] 3%|▎ | 462/13317 [04:44<2:11:43, 1.63it/s] {'loss': 0.5032, 'grad_norm': 0.7231094837188721, 'learning_rate': 3.468468468468469e-06, 'epoch': 0.1} 3%|▎ | 462/13317 [04:44<2:11:43, 1.63it/s] 3%|▎ | 463/13317 [04:45<2:11:36, 1.63it/s] {'loss': 0.4249, 'grad_norm': 0.6658948659896851, 'learning_rate': 3.4759759759759764e-06, 'epoch': 0.1} 3%|▎ | 463/13317 [04:45<2:11:36, 1.63it/s] 3%|▎ | 464/13317 [04:45<2:11:32, 1.63it/s] {'loss': 0.5033, 'grad_norm': 0.7102115750312805, 'learning_rate': 3.4834834834834835e-06, 'epoch': 0.1} 3%|▎ | 464/13317 [04:46<2:11:32, 1.63it/s] 3%|▎ | 465/13317 [04:46<2:11:32, 1.63it/s] {'loss': 0.512, 'grad_norm': 0.8554460406303406, 'learning_rate': 3.490990990990991e-06, 'epoch': 0.1} 3%|▎ | 465/13317 [04:46<2:11:32, 1.63it/s] 3%|▎ | 466/13317 [04:47<2:11:36, 1.63it/s] {'loss': 0.4825, 'grad_norm': 0.6698629856109619, 'learning_rate': 3.4984984984984987e-06, 'epoch': 0.1} 3%|▎ | 466/13317 [04:47<2:11:36, 1.63it/s] 4%|▎ | 467/13317 [04:47<2:11:31, 1.63it/s] {'loss': 0.5179, 'grad_norm': 0.8527231216430664, 'learning_rate': 3.5060060060060063e-06, 'epoch': 0.11} 4%|▎ | 467/13317 [04:47<2:11:31, 1.63it/s] 4%|▎ | 468/13317 [04:48<2:11:29, 1.63it/s] {'loss': 0.5558, 'grad_norm': 0.8860906958580017, 'learning_rate': 3.513513513513514e-06, 'epoch': 0.11} 4%|▎ | 468/13317 [04:48<2:11:29, 1.63it/s] 4%|▎ | 469/13317 [04:49<2:11:25, 1.63it/s] {'loss': 0.4335, 'grad_norm': 0.7079217433929443, 'learning_rate': 3.521021021021021e-06, 'epoch': 0.11} 4%|▎ | 469/13317 [04:49<2:11:25, 1.63it/s] 4%|▎ | 470/13317 [04:49<2:11:25, 1.63it/s] {'loss': 0.5229, 'grad_norm': 0.7085310220718384, 'learning_rate': 3.5285285285285286e-06, 'epoch': 0.11} 4%|▎ | 470/13317 [04:49<2:11:25, 1.63it/s] 4%|▎ | 471/13317 [04:50<2:11:28, 1.63it/s] {'loss': 0.4051, 'grad_norm': 0.765766441822052, 'learning_rate': 3.536036036036036e-06, 'epoch': 0.11} 4%|▎ | 471/13317 [04:50<2:11:28, 1.63it/s] 4%|▎ | 472/13317 [04:50<2:11:23, 1.63it/s] {'loss': 0.4148, 'grad_norm': 0.7836328148841858, 'learning_rate': 3.5435435435435437e-06, 'epoch': 0.11} 4%|▎ | 472/13317 [04:50<2:11:23, 1.63it/s] 4%|▎ | 473/13317 [04:51<2:11:20, 1.63it/s] {'loss': 0.4018, 'grad_norm': 0.6780300140380859, 'learning_rate': 3.5510510510510517e-06, 'epoch': 0.11} 4%|▎ | 473/13317 [04:51<2:11:20, 1.63it/s] 4%|▎ | 474/13317 [04:52<2:11:25, 1.63it/s] {'loss': 0.4146, 'grad_norm': 0.6662124991416931, 'learning_rate': 3.5585585585585584e-06, 'epoch': 0.11} 4%|▎ | 474/13317 [04:52<2:11:25, 1.63it/s] 4%|▎ | 475/13317 [04:52<2:11:21, 1.63it/s] {'loss': 0.3877, 'grad_norm': 0.6870988607406616, 'learning_rate': 3.566066066066066e-06, 'epoch': 0.11} 4%|▎ | 475/13317 [04:52<2:11:21, 1.63it/s] 4%|▎ | 476/13317 [04:53<2:11:26, 1.63it/s] {'loss': 0.3708, 'grad_norm': 0.6329861879348755, 'learning_rate': 3.573573573573574e-06, 'epoch': 0.11} 4%|▎ | 476/13317 [04:53<2:11:26, 1.63it/s] 4%|▎ | 477/13317 [04:53<2:11:24, 1.63it/s] {'loss': 0.419, 'grad_norm': 0.7007266879081726, 'learning_rate': 3.5810810810810816e-06, 'epoch': 0.11} 4%|▎ | 477/13317 [04:54<2:11:24, 1.63it/s] 4%|▎ | 478/13317 [04:54<2:11:24, 1.63it/s] {'loss': 0.466, 'grad_norm': 0.7247040271759033, 'learning_rate': 3.588588588588589e-06, 'epoch': 0.11} 4%|▎ | 478/13317 [04:54<2:11:24, 1.63it/s] 4%|▎ | 479/13317 [04:55<2:11:16, 1.63it/s] {'loss': 0.4634, 'grad_norm': 0.7045004367828369, 'learning_rate': 3.5960960960960967e-06, 'epoch': 0.11} 4%|▎ | 479/13317 [04:55<2:11:16, 1.63it/s] 4%|▎ | 480/13317 [04:55<2:11:17, 1.63it/s] {'loss': 0.4946, 'grad_norm': 0.793982982635498, 'learning_rate': 3.603603603603604e-06, 'epoch': 0.11} 4%|▎ | 480/13317 [04:55<2:11:17, 1.63it/s] 4%|▎ | 481/13317 [04:56<2:11:25, 1.63it/s] {'loss': 0.3892, 'grad_norm': 0.6738253235816956, 'learning_rate': 3.6111111111111115e-06, 'epoch': 0.11} 4%|▎ | 481/13317 [04:56<2:11:25, 1.63it/s] 4%|▎ | 482/13317 [04:56<2:11:22, 1.63it/s] {'loss': 0.3949, 'grad_norm': 0.6139069199562073, 'learning_rate': 3.618618618618619e-06, 'epoch': 0.11} 4%|▎ | 482/13317 [04:57<2:11:22, 1.63it/s] 4%|▎ | 483/13317 [04:57<2:11:17, 1.63it/s] {'loss': 0.4763, 'grad_norm': 0.8812891244888306, 'learning_rate': 3.6261261261261266e-06, 'epoch': 0.11} 4%|▎ | 483/13317 [04:57<2:11:17, 1.63it/s] 4%|▎ | 484/13317 [04:58<2:11:16, 1.63it/s] {'loss': 0.4176, 'grad_norm': 0.658255934715271, 'learning_rate': 3.633633633633634e-06, 'epoch': 0.11} 4%|▎ | 484/13317 [04:58<2:11:16, 1.63it/s] 4%|▎ | 485/13317 [04:58<2:11:17, 1.63it/s] {'loss': 0.3784, 'grad_norm': 0.5892598628997803, 'learning_rate': 3.6411411411411413e-06, 'epoch': 0.11} 4%|▎ | 485/13317 [04:58<2:11:17, 1.63it/s] 4%|▎ | 486/13317 [04:59<2:11:54, 1.62it/s] {'loss': 0.3932, 'grad_norm': 0.6718635559082031, 'learning_rate': 3.648648648648649e-06, 'epoch': 0.11} 4%|▎ | 486/13317 [04:59<2:11:54, 1.62it/s] 4%|▎ | 487/13317 [05:00<2:11:45, 1.62it/s] {'loss': 0.4165, 'grad_norm': 0.7494131922721863, 'learning_rate': 3.6561561561561565e-06, 'epoch': 0.11} 4%|▎ | 487/13317 [05:00<2:11:45, 1.62it/s] 4%|▎ | 488/13317 [05:00<2:11:37, 1.62it/s] {'loss': 0.4688, 'grad_norm': 0.6745133996009827, 'learning_rate': 3.663663663663664e-06, 'epoch': 0.11} 4%|▎ | 488/13317 [05:00<2:11:37, 1.62it/s] 4%|▎ | 489/13317 [05:01<2:11:30, 1.63it/s] {'loss': 0.4248, 'grad_norm': 0.698136568069458, 'learning_rate': 3.6711711711711716e-06, 'epoch': 0.11} 4%|▎ | 489/13317 [05:01<2:11:30, 1.63it/s] 4%|▎ | 490/13317 [05:01<2:11:27, 1.63it/s] {'loss': 0.4287, 'grad_norm': 0.6066250205039978, 'learning_rate': 3.6786786786786788e-06, 'epoch': 0.11} 4%|▎ | 490/13317 [05:02<2:11:27, 1.63it/s] 4%|▎ | 491/13317 [05:02<2:11:34, 1.62it/s] {'loss': 0.4886, 'grad_norm': 0.7168146967887878, 'learning_rate': 3.6861861861861863e-06, 'epoch': 0.11} 4%|▎ | 491/13317 [05:02<2:11:34, 1.62it/s] 4%|▎ | 492/13317 [05:03<2:11:22, 1.63it/s] {'loss': 0.4494, 'grad_norm': 0.7278876304626465, 'learning_rate': 3.693693693693694e-06, 'epoch': 0.11} 4%|▎ | 492/13317 [05:03<2:11:22, 1.63it/s] 4%|▎ | 493/13317 [05:03<2:11:29, 1.63it/s] {'loss': 0.3935, 'grad_norm': 0.7472261786460876, 'learning_rate': 3.7012012012012015e-06, 'epoch': 0.11} 4%|▎ | 493/13317 [05:03<2:11:29, 1.63it/s] 4%|▎ | 494/13317 [05:04<2:11:25, 1.63it/s] {'loss': 0.4268, 'grad_norm': 0.6780577301979065, 'learning_rate': 3.708708708708709e-06, 'epoch': 0.11} 4%|▎ | 494/13317 [05:04<2:11:25, 1.63it/s] 4%|▎ | 495/13317 [05:04<2:11:19, 1.63it/s] {'loss': 0.4721, 'grad_norm': 0.7228316068649292, 'learning_rate': 3.7162162162162162e-06, 'epoch': 0.11} 4%|▎ | 495/13317 [05:05<2:11:19, 1.63it/s] 4%|▎ | 496/13317 [05:05<2:11:17, 1.63it/s] {'loss': 0.3736, 'grad_norm': 0.65328449010849, 'learning_rate': 3.723723723723724e-06, 'epoch': 0.11} 4%|▎ | 496/13317 [05:05<2:11:17, 1.63it/s] 4%|▎ | 497/13317 [05:06<2:11:13, 1.63it/s] {'loss': 0.4561, 'grad_norm': 0.7401781678199768, 'learning_rate': 3.7312312312312314e-06, 'epoch': 0.11} 4%|▎ | 497/13317 [05:06<2:11:13, 1.63it/s] 4%|▎ | 498/13317 [05:06<2:11:23, 1.63it/s] {'loss': 0.3764, 'grad_norm': 0.6361309289932251, 'learning_rate': 3.7387387387387394e-06, 'epoch': 0.11} 4%|▎ | 498/13317 [05:06<2:11:23, 1.63it/s] 4%|▎ | 499/13317 [05:07<2:11:20, 1.63it/s] {'loss': 0.4862, 'grad_norm': 0.6940679550170898, 'learning_rate': 3.746246246246247e-06, 'epoch': 0.11} 4%|▎ | 499/13317 [05:07<2:11:20, 1.63it/s] 4%|▍ | 500/13317 [05:08<2:11:14, 1.63it/s] {'loss': 0.4615, 'grad_norm': 0.7067543864250183, 'learning_rate': 3.7537537537537537e-06, 'epoch': 0.11} 4%|▍ | 500/13317 [05:08<2:11:14, 1.63it/s] 0%| | 0/13 [00:00