% Sumner et al. (2020), "Levenshtein Augmentation ..." -- cited via DOI.
% NOTE(review): the DOI path contains "chemrxiv" and ends in ".v2", so this is
% presumably version 2 of a ChemRxiv preprint; confirm before changing the
% entry type or adding eprint fields.
% Cleanup applied to the reference-manager export: proper noun and acronym in
% the title are brace-protected against sentence-casing styles, month uses the
% predefined macro, and HTML markup in the abstract is converted to LaTeX.
% abstractNote is a non-standard field that standard styles ignore; its name
% is kept as exported so tooling that reads it keeps working.
@misc{sumner_he_thakkar_engkvist_bjerrum_2020,
  author       = {Sumner, Dean and He, Jiazhen and Thakkar, Amol and Engkvist, Ola and Bjerrum, Esben Jannik},
  title        = {{Levenshtein} Augmentation Improves Performance of {SMILES} Based Deep-Learning Synthesis Prediction},
  publisher    = {American Chemical Society (ACS)},
  year         = {2020},
  month        = jul,
  doi          = {10.26434/chemrxiv.12562121.v2},
  abstractNote = {SMILES randomization, a form of data augmentation, has previously been shown to increase the performance of deep learning models compared to non-augmented baselines. Here, we propose a novel data augmentation method we call ``Levenshtein augmentation'' which considers local SMILES sub-sequence similarity between reactants and their respective products when creating training pairs. The performance of Levenshtein augmentation was tested using two state of the art models - transformer and sequence-to-sequence based recurrent neural networks with attention. Levenshtein augmentation demonstrated an increase performance over non-augmented, and conventionally SMILES randomization augmented data when used for training of baseline models. Furthermore, Levenshtein augmentation seemingly results in what we define as \emph{attentional gain} -- an enhancement in the pattern recognition capabilities of the underlying network to molecular motifs.},
}