|
|
|
|
@ -77,3 +77,16 @@ Red: RWKV ("linear" attention) - VRAM friendly - quite faster when ctx window is
|
|
|
|
|
Black: MHA_pro (MHA with various tweaks & RWKV-type-FFN) - slow - needs more VRAM - good performance.
|
|
|
|
|
|
|
|
|
|
Parameter count: 17.2 vs 18.5 vs 18.5.
|
|
|
|
|
|
|
|
|
|
```
|
|
|
|
|
@comment{Citation for the Zenodo record of BlinkDL/RWKV-LM, version 0.01.
  The author is given in "Last, First" form so the surname is parsed
  correctly; the repository name in the title is brace-protected so that
  sentence-casing styles keep its capitalisation.}
@software{peng_bo_2021_5196578,
  author    = {Peng, Bo},
  title     = {{BlinkDL/RWKV-LM}: 0.01},
  month     = aug,
  year      = 2021,
  publisher = {Zenodo},
  version   = {0.01},
  doi       = {10.5281/zenodo.5196578},
  url       = {https://doi.org/10.5281/zenodo.5196578},
}
|
|
|
|
|
```
|
|
|
|
|
|