@article{Olsen2024.02.02.578678,
  author = {Olsen, Tobias H. and Moal, Iain H. and Deane, Charlotte M.},
  title = {Addressing the antibody germline bias and its effect on language models for improved antibody design},
  elocation-id = {2024.02.02.578678},
  year = {2024},
  doi = {10.1101/2024.02.02.578678},
  publisher = {Cold Spring Harbor Laboratory},
  abstract = {The versatile binding properties of antibodies have made them an extremely important class of biotherapeutics. However, therapeutic antibody development is a complex, expensive and time-consuming task, with the final antibody needing to not only have strong and specific binding, but also be minimally impacted by any developability issues. The success of transformer-based language models in protein sequence space and the availability of vast amounts of antibody sequences have led to the development of many antibody-specific language models to help guide antibody discovery and design. Antibody diversity primarily arises from V(D)J recombination, mutations within the CDRs, and/or from a small number of mutations away from the germline outside the CDRs. Consequently, a significant portion of the variable domain of all natural antibody sequences remains germline. This affects the pre-training of antibody-specific language models, where this facet of the sequence data introduces a prevailing bias towards germline residues. This poses a challenge, as mutations away from the germline are often vital for generating specific and potent binding to a target, meaning that language models need to be able to suggest key mutations away from germline. In this study, we explore the implications of the germline bias, examining its impact on both general-protein and antibody-specific language models. We develop and train a series of new antibody-specific language models optimised for predicting non-germline residues. We then compare our final model, AbLang-2, with current models and show how it suggests a diverse set of valid mutations with high cumulative probability. AbLang-2 is trained on both unpaired and paired data, and is freely available (https://github.com/oxpig/AbLang2.git). Competing Interest Statement: Author IM is employed by GlaxoSmithKline plc. All authors declare no other competing interests.},
  URL = {https://www.biorxiv.org/content/early/2024/02/07/2024.02.02.578678},
  eprint = {https://www.biorxiv.org/content/early/2024/02/07/2024.02.02.578678.full.pdf},
  journal = {bioRxiv}
}