Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 130 results for author: <span class="mathjax">Ren, M</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Ren%2C+M">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Ren, M"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Ren%2C+M&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Ren, M"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Ren%2C+M&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Ren%2C+M&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ren%2C+M&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Ren%2C+M&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14384">arXiv:2411.14384</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.14384">pdf</a>, <a href="https://arxiv.org/format/2411.14384">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Baking Gaussian Splatting into Diffusion Denoiser for Fast and Scalable Single-stage Image-to-3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cai%2C+Y">Yuanhao Cai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liang%2C+Y">Yixun Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Mengwei Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Q">Qing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+S+Y">Soo Ye Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhifei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yuille%2C+A">Alan Yuille</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14384v1-abstract-short" style="display: inline;"> Existing feed-forward image-to-3D methods mainly rely on 2D multi-view diffusion models that cannot guarantee 3D consistency. 
These methods easily collapse when changing the prompt view direction and mainly handle object-centric prompt images. In this paper, we propose a novel single-stage 3D diffusion model, DiffusionGS, for object and scene generation from a single view. DiffusionGS directly out&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14384v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14384v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14384v1-abstract-full" style="display: none;"> Existing feed-forward image-to-3D methods mainly rely on 2D multi-view diffusion models that cannot guarantee 3D consistency. These methods easily collapse when changing the prompt view direction and mainly handle object-centric prompt images. In this paper, we propose a novel single-stage 3D diffusion model, DiffusionGS, for object and scene generation from a single view. DiffusionGS directly outputs 3D Gaussian point clouds at each timestep to enforce view consistency and allow the model to generate robustly given prompt views of any directions, beyond object-centric inputs. Plus, to improve the capability and generalization ability of DiffusionGS, we scale up 3D training data by developing a scene-object mixed training strategy. Experiments show that our method enjoys better generation quality (2.20 dB higher in PSNR and 23.25 lower in FID) and over 5x faster speed (~6s on an A100 GPU) than SOTA methods. The user study and text-to-3D applications also reveals the practical values of our method. Our Project page at https://caiyuanhao1998.github.io/project/DiffusionGS/ shows the video and interactive generation results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14384v1-abstract-full').style.display = 'none'; document.getElementById('2411.14384v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">A novel one-stage 3DGS-based diffusion generates objects and scenes from a single view in ~6 seconds</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08324">arXiv:2411.08324</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.08324">pdf</a>, <a href="https://arxiv.org/format/2411.08324">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Are LLMs Prescient? 
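The single-stage idea above invites a small illustration: a denoiser head that, at every timestep, emits explicit 3D Gaussian parameters (renderable, and therefore view-consistent by construction) rather than a 2D image. The sketch below is a hypothetical reading of the abstract; the module, dimensions, and parameterization are illustrative assumptions, not the authors' architecture.

```python
import torch
import torch.nn as nn

# Hypothetical sketch: a denoiser whose output at each timestep is a set of
# 3D Gaussians (center, scale, opacity, color) rather than a 2D image, so
# every intermediate step is a renderable, view-consistent scene.
class GaussianDenoiser(nn.Module):
    def __init__(self, latent_dim: int = 256, n_gaussians: int = 1024):
        super().__init__()
        self.n = n_gaussians
        # 3 (center) + 3 (scale) + 1 (opacity) + 3 (color) = 10 per Gaussian
        self.net = nn.Sequential(
            nn.Linear(latent_dim + 1, 512),
            nn.SiLU(),
            nn.Linear(512, n_gaussians * 10),
        )

    def forward(self, z_t: torch.Tensor, t: torch.Tensor):
        # z_t: (B, latent_dim) noisy latent; t: (B,) timesteps in [0, 1]
        h = torch.cat([z_t, t[:, None]], dim=-1)
        p = self.net(h).view(-1, self.n, 10)
        centers, log_scales, opacity_logit, color_logit = p.split([3, 3, 1, 3], dim=-1)
        return centers, log_scales.exp(), opacity_logit.sigmoid(), color_logit.sigmoid()
```

In the method as described, these Gaussians would then be splatted into images so the denoising loss can be applied from the prompt view; the rasterizer is omitted here for brevity.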
2. arXiv:2411.08324 [pdf, other] cs.CL cs.AI cs.LG
   Are LLMs Prescient? A Continuous Evaluation using Daily News as the Oracle
   Authors: Hui Dai, Ryan Teehan, Mengye Ren
   Abstract: Many existing evaluation benchmarks for Large Language Models (LLMs) quickly become outdated due to the emergence of new models and training data. These benchmarks also fall short in assessing how LLM performance changes over time, as they consist of static questions without a temporal dimension. To address these limitations, we propose using future event prediction as a continuous evaluation method to assess LLMs' temporal generalization and forecasting abilities. Our benchmark, Daily Oracle, automatically generates question-answer (QA) pairs from daily news, challenging LLMs to predict "future" event outcomes. Our findings reveal that as pre-training data becomes outdated, LLM performance degrades over time. While Retrieval Augmented Generation (RAG) has the potential to enhance prediction accuracy, the performance degradation pattern persists, highlighting the need for continuous model updates.
   Submitted 12 November, 2024; originally announced November 2024.
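The evaluation protocol described here is easy to picture in code: date-stamped QA pairs distilled from news, with accuracy bucketed by time so that degradation past a model's training cutoff becomes visible. The following sketch is our paraphrase of that loop; every name in it is hypothetical rather than the benchmark's actual API.

```python
from dataclasses import dataclass
from datetime import date
from typing import Callable

@dataclass
class OracleQA:
    asked_on: date   # when the question is posed
    question: str    # e.g. "Will X happen by <date>?"
    answer: str      # ground truth, resolved later from the news itself

def accuracy_by_month(qa_pairs: list[OracleQA],
                      predict: Callable[[str], str]) -> dict:
    """Score a model month by month; a downward trend past the training
    cutoff is the degradation pattern the paper reports."""
    buckets: dict = {}
    for qa in qa_pairs:
        key = (qa.asked_on.year, qa.asked_on.month)
        hit = predict(qa.question).strip().lower() == qa.answer.lower()
        hits, total = buckets.get(key, (0, 0))
        buckets[key] = (hits + hit, total + 1)
    return {k: hits / total for k, (hits, total) in sorted(buckets.items())}
```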
3. arXiv:2411.05676 [pdf, other] cs.LG cs.AI
   Improving Molecular Graph Generation with Flow Matching and Optimal Transport
   Authors: Xiaoyang Hou, Tian Zhu, Milong Ren, Dongbo Bu, Xin Gao, Chunming Zhang, Shiwei Sun
   Abstract: Generating molecular graphs is crucial in drug design and discovery but remains challenging due to the complex interdependencies between nodes and edges. While diffusion models have demonstrated their potential in molecular graph design, they often suffer from unstable training and inefficient sampling. To enhance generation performance and training stability, we propose GGFlow, a discrete flow matching generative model for molecular graphs that incorporates optimal transport and an edge-augmented graph transformer to enable direct communication among chemical bonds. Additionally, GGFlow introduces a novel goal-guided generation framework to control the generative trajectory of our model, aiming to design novel molecular structures with the desired properties. GGFlow demonstrates superior performance on both unconditional and conditional molecule generation tasks, outperforming existing baselines and underscoring its effectiveness and potential for wider application.
   Submitted 8 November, 2024; originally announced November 2024.
4. arXiv:2411.02445 [pdf, other] cs.CV
   WiCV@CVPR2024: The Thirteenth Women In Computer Vision Workshop at the Annual CVPR Conference
   Authors: Asra Aslam, Sachini Herath, Ziqi Huang, Estefania Talavera, Deblina Bhattacharjee, Himangi Mittal, Vanessa Staderini, Mengwei Ren, Azade Farshad
   Abstract: In this paper, we present the details of the Women in Computer Vision Workshop - WiCV 2024, organized alongside CVPR 2024 in Seattle, Washington, United States. WiCV aims to amplify the voices of underrepresented women in the computer vision community, fostering increased visibility in both academia and industry. We believe that such events play a vital role in addressing gender imbalances within the field. The annual WiCV@CVPR workshop offers a) opportunities for collaboration between researchers from minority groups, b) mentorship for female junior researchers, c) financial support to presenters to alleviate financial burdens, and d) a diverse array of role models who can inspire younger researchers at the outset of their careers. We present a comprehensive report on the workshop program, historical trends from past WiCV@CVPR events, and a summary of statistics related to presenters, attendees, and sponsorship for the WiCV 2024 workshop.
   Submitted 2 November, 2024; originally announced November 2024.
   Comments: arXiv admin note: substantial text overlap with arXiv:2309.12768

5. arXiv:2411.02372 [pdf, other] cs.CV cs.LG
   Learning General-Purpose Biomedical Volume Representations using Randomized Synthesis
   Authors: Neel Dey, Benjamin Billot, Hallee E. Wong, Clinton J. Wang, Mengwei Ren, P. Ellen Grant, Adrian V. Dalca, Polina Golland
   Abstract: Current volumetric biomedical foundation models struggle to generalize as public 3D datasets are small and do not cover the broad diversity of medical procedures, conditions, anatomical regions, and imaging protocols. We address this by creating a representation learning method that instead anticipates strong domain shifts at training time itself. We first propose a data engine that synthesizes highly variable training samples that enable generalization to new biomedical contexts. To then train a single 3D network for any voxel-level task, we develop a contrastive learning method that pretrains the network to be stable against nuisance imaging variation simulated by the data engine, a key inductive bias for generalization. This network's features can be used as robust representations of input images for downstream tasks, and its weights provide a strong, dataset-agnostic initialization for finetuning on new datasets. As a result, we set new standards across both multimodality registration and few-shot segmentation, a first for any 3D biomedical vision model, all without (pre-)training on any existing dataset of real images.
   Submitted 4 November, 2024; originally announced November 2024.
   Comments: Code and model weights available at https://github.com/neel-dey/anatomix
6. arXiv:2410.12896 [pdf, other] cs.CL
   A Survey on Data Synthesis and Augmentation for Large Language Models
   Authors: Ke Wang, Jiahui Zhu, Minjie Ren, Zeming Liu, Shiwei Li, Zongye Zhang, Chenkai Zhang, Xiaoyu Wu, Qiqi Zhan, Qingjie Liu, Yunhong Wang
   Abstract: The success of Large Language Models (LLMs) is inherently linked to the availability of vast, diverse, and high-quality data for training and evaluation. However, the growth rate of high-quality data is significantly outpaced by the expansion of training datasets, leading to a looming data exhaustion crisis. This underscores the urgent need to enhance data efficiency and explore new data sources. In this context, synthetic data has emerged as a promising solution. Currently, data generation primarily consists of two major approaches: data augmentation and synthesis. This paper comprehensively reviews and summarizes data generation techniques throughout the lifecycle of LLMs, including data preparation, pre-training, fine-tuning, instruction-tuning, preference alignment, and applications. Furthermore, we discuss the current constraints faced by these methods and investigate potential pathways for future development and research. Our aspiration is to equip researchers with a clear understanding of these methodologies, enabling them to swiftly identify appropriate data generation strategies in the construction of LLMs, while providing valuable insights for future exploration.
   Submitted 16 October, 2024; originally announced October 2024.

7. arXiv:2410.05525 [pdf, other] cs.CV
   Generative Portrait Shadow Removal
   Authors: Jae Shin Yoon, Zhixin Shu, Mengwei Ren, Xuaner Zhang, Yannick Hold-Geoffroy, Krishna Kumar Singh, He Zhang
   Abstract: We introduce a high-fidelity portrait shadow removal model that can effectively enhance the image of a portrait by predicting its appearance under disturbing shadows and highlights. Portrait shadow removal is a highly ill-posed problem where multiple plausible solutions can be found based on a single image. While existing works have solved this problem by predicting appearance residuals that can propagate local shadow distribution, such methods are often incomplete and lead to unnatural predictions, especially for portraits with hard shadows. We overcome the limitations of existing local propagation methods by formulating the removal problem as a generation task in which a diffusion model learns to globally rebuild the human appearance from scratch, conditioned on an input portrait image. For robust and natural shadow removal, we propose to train the diffusion model with a compositional repurposing framework: a pre-trained text-guided image generation model is first fine-tuned to harmonize the lighting and color of the foreground with a background scene using a background harmonization dataset; the model is then further fine-tuned to generate a shadow-free portrait image via a shadow-paired dataset. To overcome the loss of fine details in the latent diffusion model, we propose a guided-upsampling network to restore the original high-frequency details (wrinkles and dots) from the input image. To enable our compositional training framework, we construct a high-fidelity and large-scale dataset using a lightstage capture system and synthetic graphics simulation. Our generative framework effectively removes shadows caused by both self and external occlusions while maintaining the original lighting distribution and high-frequency details. Our method also demonstrates robustness to diverse subjects captured in real environments.
   Submitted 7 October, 2024; originally announced October 2024.
   Comments: 17 pages, SIGGRAPH Asia, TOG
8. arXiv:2410.02309 [pdf, other] cs.CV
   Decoupling Layout from Glyph in Online Chinese Handwriting Generation
   Authors: Min-Si Ren, Yan-Ming Zhang, Yi Chen
   Abstract: Text plays a crucial role in the transmission of human civilization, and teaching machines to generate online handwritten text in various styles presents an interesting and significant challenge. However, most prior work has concentrated on generating individual Chinese fonts, leaving complete text line generation largely unexplored. In this paper, we identify that text lines can naturally be divided into two components: layout and glyphs. Based on this division, we design a text line layout generator coupled with a diffusion-based stylized font synthesizer to address the challenge hierarchically. More concretely, the layout generator performs in-context-like learning based on the text content and the provided style references to autoregressively generate positions for each glyph. Meanwhile, the font synthesizer, which consists of a character embedding dictionary, a multi-scale calligraphy style encoder, and a 1D U-Net based diffusion denoiser, generates each glyph at its position while imitating the calligraphy style extracted from the given style references. Qualitative and quantitative experiments on CASIA-OLHWDB demonstrate that our method is capable of generating structurally correct and indistinguishable imitation samples.
   Submitted 4 October, 2024; v1 submitted 3 October, 2024; originally announced October 2024.

9. arXiv:2409.12311 [pdf, other] cs.RO eess.SY
   Towards Closing the Loop in Robotic Pollination for Indoor Farming via Autonomous Microscopic Inspection
   Authors: Chuizheng Kong, Alex Qiu, Idris Wibowo, Marvin Ren, Aishik Dhori, Kai-Shu Ling, Ai-Ping Hu, Shreyas Kousik
   Abstract: Effective pollination is a key challenge for indoor farming, since bees struggle to navigate without the sun. While a variety of robotic system solutions have been proposed, it remains difficult to autonomously check that a flower has been sufficiently pollinated to produce high-quality fruit, which is especially critical for self-pollinating crops such as strawberries. To this end, this work proposes a novel robotic system for indoor farming. The proposed hardware combines a 7-degree-of-freedom (DOF) manipulator arm with a custom end-effector comprising an endoscope camera, a 2-DOF microscope subsystem, and a custom vibrating pollination tool; this is paired with algorithms to detect and estimate the pose of strawberry flowers, navigate to each flower, pollinate using the tool, and inspect with the microscope. The key novelty is vibrating the flower from below while simultaneously inspecting with a microscope from above. Each subsystem is validated via extensive experiments.
   Submitted 18 September, 2024; originally announced September 2024.
10. arXiv:2409.09715 [pdf, ps, other] cs.IT cs.GT
   Generative Semantic Communication via Textual Prompts: Latency Performance Tradeoffs
   Authors: Mengmeng Ren, Li Qiao, Long Yang, Zhen Gao, Jian Chen, Mahdi Boloursaz Mashhadi, Pei Xiao, Rahim Tafazolli, Mehdi Bennis
   Abstract: This paper develops an edge-device collaborative Generative Semantic Communications (Gen SemCom) framework leveraging pre-trained Multi-modal/Vision Language Models (M/VLMs) for ultra-low-rate semantic communication via textual prompts. The proposed framework optimizes the use of M/VLMs on the wireless edge/device to generate high-fidelity textual prompts through visual captioning/question answering, which are then transmitted over a wireless channel for SemCom. Specifically, we develop a multi-user Gen SemCom framework using pre-trained M/VLMs, and formulate a joint optimization problem over prompt generation offloading and communication and computation resource allocation to minimize latency and maximize the resulting semantic quality. Due to the nonconvex nature of the problem, with highly coupled discrete and continuous variables, we decompose it into a two-level problem and propose a low-complexity swap/leaving/joining (SLJ)-based matching algorithm. Simulation results demonstrate significant performance improvements over conventional semantic-unaware/non-collaborative offloading benchmarks.
   Submitted 15 September, 2024; originally announced September 2024.

11. arXiv:2408.11208 [pdf, other] cs.CV cs.LG
   PooDLe: Pooled and dense self-supervised learning from naturalistic videos
   Authors: Alex N. Wang, Christopher Hoang, Yuwen Xiong, Yann LeCun, Mengye Ren
   Abstract: Self-supervised learning has driven significant progress in learning from single-subject, iconic images. However, there are still unanswered questions about the use of minimally-curated, naturalistic video data, which contain dense scenes with many independent objects, imbalanced class distributions, and varying object sizes. In this paper, we propose a novel approach that combines an invariance-based SSL objective on pooled representations with a dense SSL objective that enforces equivariance to optical flow warping. Our findings indicate that a unified objective applied at multiple feature scales is essential for learning effective image representations from high-resolution, naturalistic videos. We validate our approach on the BDD100K driving video dataset and the Walking Tours first-person video dataset, demonstrating its ability to capture spatial understanding from a dense objective and semantic understanding via a pooled representation objective.
   Submitted 20 August, 2024; originally announced August 2024.
   Comments: Project page: https://poodle-ssl.github.io
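The two objectives named in this abstract pair cleanly in code: a pooled term that makes global descriptors of nearby frames agree, and a dense term that compares feature maps only after warping one of them along optical flow. The sketch below is a rough, single-scale rendering of that idea (the paper applies it at multiple feature scales); `backbone` and the flow input are assumed given.

```python
import torch
import torch.nn.functional as F

def flow_warp(feat: torch.Tensor, flow: torch.Tensor) -> torch.Tensor:
    """Backward-warp a (B, C, H, W) feature map by a (B, 2, H, W) flow."""
    B, C, H, W = feat.shape
    ys, xs = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
    grid = torch.stack((xs, ys), dim=-1).float() + flow.permute(0, 2, 3, 1)
    gx = 2 * grid[..., 0] / (W - 1) - 1   # normalize x to [-1, 1]
    gy = 2 * grid[..., 1] / (H - 1) - 1   # normalize y to [-1, 1]
    return F.grid_sample(feat, torch.stack((gx, gy), dim=-1), align_corners=True)

def poodle_style_loss(backbone, frame_t, frame_t1, flow):
    f_t, f_t1 = backbone(frame_t), backbone(frame_t1)   # (B, C, H, W) each
    # Pooled invariance: global descriptors of adjacent frames should agree.
    pooled = 1 - F.cosine_similarity(f_t.mean(dim=(2, 3)),
                                     f_t1.mean(dim=(2, 3))).mean()
    # Dense equivariance: features should transport along the optical flow.
    dense = 1 - F.cosine_similarity(f_t, flow_warp(f_t1, flow), dim=1).mean()
    return pooled + dense
```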
To this end, we propose a novel evaluation framework referred to as StructE&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03281v2-abstract-full').style.display = 'inline'; document.getElementById('2408.03281v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.03281v2-abstract-full" style="display: none;"> Evaluation is the baton for the development of large language models. Current evaluations typically employ a single-item assessment paradigm for each atomic test objective, which struggles to discern whether a model genuinely possesses the required capabilities or merely memorizes/guesses the answers to specific questions. To this end, we propose a novel evaluation framework referred to as StructEval. Starting from an atomic test objective, StructEval deepens and broadens the evaluation by conducting a structured assessment across multiple cognitive levels and critical concepts, and therefore offers a comprehensive, robust and consistent evaluation for LLMs. Experiments on three widely-used benchmarks demonstrate that StructEval serves as a reliable tool for resisting the risk of data contamination and reducing the interference of potential biases, thereby providing more reliable and consistent conclusions regarding model capabilities. Our framework also sheds light on the design of future principled and trustworthy LLM evaluation protocols. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.03281v2-abstract-full').style.display = 'none'; document.getElementById('2408.03281v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACL 2024;Benchmark at https://github.com/c-box/StructEval ;Leaderboard at https://huggingface.co/spaces/Bowieee/StructEval_leaderboard</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.02226">arXiv:2408.02226</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.02226">pdf</a>, <a href="https://arxiv.org/format/2408.02226">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ProCreate, Don&#39;t Reproduce! 
Propulsive Energy Diffusion for Creative Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lu%2C+J">Jack Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Teehan%2C+R">Ryan Teehan</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Mengye Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.02226v2-abstract-short" style="display: inline;"> In this paper, we propose ProCreate, a simple and easy-to-implement method to improve sample diversity and creativity of diffusion-based image generative models and to prevent training data reproduction. ProCreate operates on a set of reference images and actively propels the generated image embedding away from the reference embeddings during the generation process. We propose FSCG-8 (Few-Shot Cre&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02226v2-abstract-full').style.display = 'inline'; document.getElementById('2408.02226v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.02226v2-abstract-full" style="display: none;"> In this paper, we propose ProCreate, a simple and easy-to-implement method to improve sample diversity and creativity of diffusion-based image generative models and to prevent training data reproduction. ProCreate operates on a set of reference images and actively propels the generated image embedding away from the reference embeddings during the generation process. We propose FSCG-8 (Few-Shot Creative Generation 8), a few-shot creative generation dataset on eight different categories -- encompassing different concepts, styles, and settings -- in which ProCreate achieves the highest sample diversity and fidelity. Furthermore, we show that ProCreate is effective at preventing replication of training data in a large-scale evaluation using training text prompts. Code and FSCG-8 are available at https://github.com/Agentic-Learning-AI-Lab/procreate-diffusion-public. The project page is available at https://procreate-diffusion.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.02226v2-abstract-full').style.display = 'none'; document.getElementById('2408.02226v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ECCV 2024.
Project page: https://procreate-diffusion.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.20741">arXiv:2407.20741</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.20741">pdf</a>, <a href="https://arxiv.org/format/2407.20741">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Dynamical Systems">math.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> </div> </div> <p class="title is-5 mathjax"> Improving PINNs By Algebraic Inclusion of Boundary and Initial Conditions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Mohan Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Z">Zhihao Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+K">Keren Li</a>, <a href="/search/cs?searchtype=author&amp;query=Mukherjee%2C+A">Anirbit Mukherjee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.20741v1-abstract-short" style="display: inline;"> &#34;AI for Science&#34; aims to solve fundamental scientific problems using AI techniques. As most physical phenomena can be described as Partial Differential Equations (PDEs), approximating their solutions using neural networks has evolved as a central component of scientific-ML. Physics-Informed Neural Networks (PINNs) is the general method that has evolved for this task, but its training is well-known&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20741v1-abstract-full').style.display = 'inline'; document.getElementById('2407.20741v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.20741v1-abstract-full" style="display: none;"> &#34;AI for Science&#34; aims to solve fundamental scientific problems using AI techniques. As most physical phenomena can be described as Partial Differential Equations (PDEs), approximating their solutions using neural networks has evolved as a central component of scientific-ML. Physics-Informed Neural Networks (PINNs) is the general method that has evolved for this task, but its training is well-known to be very unstable. In this work we explore the possibility of changing the model being trained from being just a neural network to being a non-linear transformation of it - one that algebraically includes the boundary/initial conditions. This reduces the number of terms in the loss function compared to the standard PINN losses. We demonstrate that our modification leads to significant performance gains across a range of benchmark tasks, in various dimensions and without having to tweak the training algorithm. Our conclusions are based on conducting hundreds of experiments, in the fully unsupervised setting, over multiple linear and non-linear PDEs set to exactly solvable scenarios, which lends itself to a concrete measurement of our performance gains, in terms of order(s) of magnitude lower fractional errors than those achieved by standard PINNs.
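<p>The core trick described in this PINNs entry, baking the boundary/initial conditions into the model algebraically so the loss keeps only the PDE residual, can be illustrated with a hand-built ansatz. The sketch below is a hypothetical minimal example for the 1D heat equation u_t = u_xx with u(0,t) = u(1,t) = 0 and u(x,0) = sin(pi x); the network size, optimizer settings, and the specific ansatz are illustrative assumptions, not the authors' code.</p> <pre><code>
import torch, math

net = torch.nn.Sequential(
    torch.nn.Linear(2, 64), torch.nn.Tanh(),
    torch.nn.Linear(64, 64), torch.nn.Tanh(),
    torch.nn.Linear(64, 1),
)

def u_hat(x, t):
    # Ansatz u(x,t) = sin(pi x) + x (1 - x) t * N(x,t):
    # satisfies u(x,0) = sin(pi x) and u(0,t) = u(1,t) = 0 by construction,
    # so the loss needs only the PDE residual term.
    n = net(torch.cat([x, t], dim=1))
    return torch.sin(math.pi * x) + x * (1.0 - x) * t * n

opt = torch.optim.Adam(net.parameters(), lr=1e-3)
for step in range(2000):
    x = torch.rand(256, 1, requires_grad=True)
    t = torch.rand(256, 1, requires_grad=True)
    u = u_hat(x, t)
    u_t = torch.autograd.grad(u.sum(), t, create_graph=True)[0]
    u_x = torch.autograd.grad(u.sum(), x, create_graph=True)[0]
    u_xx = torch.autograd.grad(u_x.sum(), x, create_graph=True)[0]
    loss = ((u_t - u_xx) ** 2).mean()   # residual of u_t = u_xx only
    opt.zero_grad(); loss.backward(); opt.step()
</code></pre>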
The code accompanying this manuscript is publicly available at https://github.com/MorganREN/Improving-PINNs-By-Algebraic-Inclusion-of-Boundary-and-Initial-Conditions <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.20741v1-abstract-full').style.display = 'none'; document.getElementById('2407.20741v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">48 Pages, 25 Figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.06551">arXiv:2407.06551</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.06551">pdf</a>, <a href="https://arxiv.org/format/2407.06551">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OffsetBias: Leveraging Debiased Data for Tuning Evaluators </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Park%2C+J">Junsoo Park</a>, <a href="/search/cs?searchtype=author&amp;query=Jwa%2C+S">Seungyeon Jwa</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Meiying Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Kim%2C+D">Daeyoung Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Choi%2C+S">Sanghyuk Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.06551v2-abstract-short" style="display: inline;"> Employing Large Language Models (LLMs) to assess the quality of generated responses, such as prompting instruct-tuned models or fine-tuning judge models, has become a widely adopted evaluation method. It is also known that such evaluators are vulnerable to biases, such as favoring longer responses. While it is important to overcome this problem, the specifics of these biases remain under-explored.&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06551v2-abstract-full').style.display = 'inline'; document.getElementById('2407.06551v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.06551v2-abstract-full" style="display: none;"> Employing Large Language Models (LLMs) to assess the quality of generated responses, such as prompting instruct-tuned models or fine-tuning judge models, has become a widely adopted evaluation method. It is also known that such evaluators are vulnerable to biases, such as favoring longer responses. While it is important to overcome this problem, the specifics of these biases remain under-explored. In this work, we qualitatively identify six types of biases inherent in various judge models. We propose EvalBiasBench as a meta-evaluation collection of hand-crafted test cases for each bias type. Additionally, we present de-biasing dataset construction methods and the associated preference dataset OffsetBias.
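<p>Two of the judge biases discussed in this OffsetBias entry, position bias and length bias, can be probed with simple counterfactual checks. The sketch below is a hypothetical probe, not the paper's EvalBiasBench protocol; judge() is a placeholder that returns "A" or "B" for a pair of candidate responses.</p> <pre><code>
def position_bias_rate(judge, pairs):
    # Fraction of pairs where the judge keeps the same positional verdict
    # after the two responses are swapped, i.e. it favours a slot, not content.
    hits = 0
    for prompt, a, b in pairs:
        if judge(prompt, a, b) == judge(prompt, b, a):
            hits += 1
    return hits / len(pairs)

def length_bias_rate(judge, pairs):
    # How often the judge picks the longer response, regardless of content.
    longer_wins = 0
    for prompt, a, b in pairs:
        picked = a if judge(prompt, a, b) == "A" else b
        if picked == max((a, b), key=len):
            longer_wins += 1
    return longer_wins / len(pairs)
</code></pre>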
Experimental results demonstrate that fine-tuning on our dataset significantly enhances the robustness of judge models against biases and improves performance across most evaluation scenarios. We release our datasets and the fine-tuned judge model to the public. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.06551v2-abstract-full').style.display = 'none'; document.getElementById('2407.06551v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">EMNLP2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00322">arXiv:2407.00322</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.00322">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLM-Generated Natural Language Meets Scaling Laws: New Explorations and Data Augmentation Methods </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhenhua Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+G">Guang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Ming Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00322v1-abstract-short" style="display: inline;"> With the ascent of large language models (LLM), natural language processing has witnessed enhancements, such as LLM-based data augmentation. Nonetheless, prior research harbors two primary concerns: firstly, a lack of contemplation regarding whether the natural language generated by LLM (LLMNL) truly aligns with human natural language (HNL), a critical foundational question; secondly, an oversight&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00322v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00322v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00322v1-abstract-full" style="display: none;"> With the ascent of large language models (LLM), natural language processing has witnessed enhancements, such as LLM-based data augmentation. Nonetheless, prior research harbors two primary concerns: firstly, a lack of contemplation regarding whether the natural language generated by LLM (LLMNL) truly aligns with human natural language (HNL), a critical foundational question; secondly, an oversight that augmented data is randomly generated by the LLM, implying that not all data may possess equal training value, which could impede the performance of classifiers. To address these challenges, we introduce the scaling laws to intrinsically calculate LLMNL and HNL.
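<p>The rank-frequency scaling this entry builds on can be estimated directly from token counts. The sketch below is a hypothetical fit of the Zipf-Mandelbrot exponent via log-log least squares; the offset b=2.7 is a conventional illustrative choice, and the comparison in the final comment is hypothetical rather than the paper's procedure.</p> <pre><code>
import numpy as np
from collections import Counter

def mandelbrot_exponent(tokens, b=2.7):
    # Rank-frequency fit of f(r) proportional to (r + b) ** (-a);
    # returns the fitted exponent a from least squares in log-log space.
    freqs = np.array(sorted(Counter(tokens).values(), reverse=True), dtype=float)
    ranks = np.arange(1, len(freqs) + 1, dtype=float)
    slope, intercept = np.polyfit(np.log(ranks + b), np.log(freqs), 1)
    return -slope

# Hypothetical comparison of an LLM corpus with a human corpus:
# gap = abs(mandelbrot_exponent(llm_tokens) - mandelbrot_exponent(hnl_tokens))
</code></pre>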
Through extensive experiments, we reveal slight deviations (approximately 0.2 Mandelbrot exponent) from Mandelbrot&#39;s law in LLMNL, underscore a complexity advantage in HNL, and supplement an interpretive discussion on language style. This establishes a solid foundation for LLM&#39;s expansion. Further, we introduce a novel data augmentation method for few-shot text classification, termed ZGPTDA, which leverages fuzzy computing mechanisms driven by the conformity to scaling laws to make decisions about GPT-4 augmented data. Extensive experiments, conducted in real-world scenarios, confirm the effectiveness (improving F1 of Bert and RoBerta by 7-10%) and competitiveness (surpassing recent AugGPT and GENCO methods by about 2% accuracy on DeBerta) of ZGPTDA. In addition, we reveal some interesting insights, e.g., Hilberg&#39;s law and Taylor&#39;s law can impart more benefits to text classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00322v1-abstract-full').style.display = 'none'; document.getElementById('2407.00322v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18144">arXiv:2406.18144</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.18144">pdf</a>, <a href="https://arxiv.org/format/2406.18144">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/s11263-024-02153-0">10.1007/s11263-024-02153-0 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Artificial Immune System of Secure Face Recognition Against Adversarial Attacks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Min Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yunlong Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yuhao Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yongzhen Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Z">Zhenan Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qi Li</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+T">Tieniu Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18144v1-abstract-short" style="display: inline;"> Insect production for food and feed presents a promising supplement to ensure food safety and address the adverse impacts of agriculture on climate and environment in the future. However, optimisation is required for insect production to realise its full potential.
This can be achieved by targeted improvement of traits of interest through selective breeding, an approach which has so far been underexplored&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18144v1-abstract-full').style.display = 'inline'; document.getElementById('2406.18144v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18144v1-abstract-full" style="display: none;"> Insect production for food and feed presents a promising supplement to ensure food safety and address the adverse impacts of agriculture on climate and environment in the future. However, optimisation is required for insect production to realise its full potential. This can be achieved by targeted improvement of traits of interest through selective breeding, an approach which has so far been underexplored and underutilised in insect farming. Here we present a comprehensive review of the selective breeding framework in the context of insect production. We systematically evaluate adjustments of selective breeding techniques to the realm of insects and highlight the essential components integral to the breeding process. The discussion covers every step of a conventional breeding scheme, such as formulation of breeding objectives, phenotyping, estimation of genetic parameters and breeding values, selection of appropriate breeding strategies, and mitigation of issues associated with genetic diversity depletion and inbreeding. This review combines knowledge from diverse disciplines, bridging the gap between animal breeding, quantitative genetics, evolutionary biology, and entomology, offering an integrated view of the insect breeding research area and uniting knowledge which has previously remained scattered across diverse fields of expertise. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18144v1-abstract-full').style.display = 'none'; document.getElementById('2406.18144v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> International Journal of Computer Vision (IJCV), 2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08980">arXiv:2406.08980</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.08980">pdf</a>, <a href="https://arxiv.org/format/2406.08980">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> From Theory to Therapy: Reframing SBDD Model Evaluation via Practical Metrics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+B">Bowen Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+H">Haichuan Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yanwen Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Minsi Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+X">Xiao Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+W">Wei-Ying Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Ya-Qin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+Y">Yanyan Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08980v1-abstract-short" style="display: inline;"> Recent advancements in structure-based drug design (SBDD) have significantly enhanced the efficiency and precision of drug discovery by generating molecules tailored to bind specific protein pockets. Despite these technological strides, their practical application in real-world drug development remains challenging due to the complexities of synthesizing and testing these molecules. The reliability&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08980v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08980v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08980v1-abstract-full" style="display: none;"> Recent advancements in structure-based drug design (SBDD) have significantly enhanced the efficiency and precision of drug discovery by generating molecules tailored to bind specific protein pockets. Despite these technological strides, their practical application in real-world drug development remains challenging due to the complexities of synthesizing and testing these molecules. The reliability of the Vina docking score, the current standard for assessing binding abilities, is increasingly questioned due to its susceptibility to overfitting. To address these limitations, we propose a comprehensive evaluation framework that includes assessing the similarity of generated molecules to known active compounds, introducing a virtual screening-based metric for practical deployment capabilities, and re-evaluating binding affinity more rigorously. 
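<p>One plausible reading of the "similarity of generated molecules to known active compounds" metric in this SBDD evaluation entry is a nearest-active fingerprint similarity. The sketch below is a hypothetical implementation with RDKit Morgan fingerprints; the radius (2) and bit size (2048) are illustrative assumptions, not the paper's specification.</p> <pre><code>
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

def max_similarity_to_actives(generated_smiles, active_smiles):
    # Tanimoto similarity of each generated molecule to its nearest
    # known active compound, using Morgan fingerprints.
    actives = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(s), 2, 2048)
               for s in active_smiles]
    scores = []
    for s in generated_smiles:
        mol = Chem.MolFromSmiles(s)
        if mol is None:
            continue  # skip unparsable generations
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048)
        scores.append(max(DataStructs.TanimotoSimilarity(fp, a) for a in actives))
    return scores
</code></pre>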
Our experiments reveal that while current SBDD models achieve high Vina scores, they fall short in practical usability metrics, highlighting a significant gap between theoretical predictions and real-world applicability. Our proposed metrics and dataset aim to bridge this gap, enhancing the practical applicability of future SBDD models and aligning them more closely with the needs of pharmaceutical research and development. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08980v1-abstract-full').style.display = 'none'; document.getElementById('2406.08980v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.05613">arXiv:2406.05613</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.05613">pdf</a>, <a href="https://arxiv.org/format/2406.05613">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Distributed Motion Control of Multiple Mobile Manipulators for Reducing Interaction Wrench in Object Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wenhang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Meng Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+K">Kun Song</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+G">Gaoming Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+M+Y">Michael Yu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+Z">Zhenhua Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.05613v2-abstract-short" style="display: inline;"> In real-world cooperative manipulation of objects, multiple mobile manipulator systems may suffer from disturbances and asynchrony, leading to excessive interaction wrenches and potentially causing object damage or emergency stops. This paper presents a novel distributed motion control approach aimed at reducing these unnecessary interaction wrenches. The control strategy for each robot only utili&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05613v2-abstract-full').style.display = 'inline'; document.getElementById('2406.05613v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.05613v2-abstract-full" style="display: none;"> In real-world cooperative manipulation of objects, multiple mobile manipulator systems may suffer from disturbances and asynchrony, leading to excessive interaction wrenches and potentially causing object damage or emergency stops. This paper presents a novel distributed motion control approach aimed at reducing these unnecessary interaction wrenches. The control strategy for each robot only utilizes information from the local force sensors and neighboring robots, without the need for global position and velocity information. 
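<p>Before the abstract continues, the local-information structure just described can be sketched schematically. The code below is a generic admittance-plus-consensus update, a hypothetical illustration of control from local force sensing and neighbor states only; it is not the paper's proven control law, and all gains are illustrative assumptions.</p> <pre><code>
import numpy as np

def local_control_step(x, v_des, wrench, neighbors, kf=0.4, kc=0.8, dt=0.01):
    # One update per robot using only local information: track a desired
    # velocity, comply with the locally sensed interaction wrench
    # (admittance term), and average with neighbors (consensus term).
    x_new = np.copy(x)
    for i in range(len(x)):
        consensus = sum(x[j] - x[i] for j in neighbors[i])
        v = v_des[i] - kf * wrench[i] + kc * consensus
        x_new[i] = x[i] + dt * v
    return x_new
</code></pre>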
Disturbances are corrected through compensatory movements of the manipulators. In addition, the robustness of the control law against communication delays between robots is also considered. The stability of the control law is rigorously proven using Lyapunov theory. Subsequently, the efficacy of the proposed control law is validated through simulations and experiments of collaborative object manipulation by two robots. Experimental results demonstrate the effectiveness of the proposed control law in reducing interaction wrenches during object manipulation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.05613v2-abstract-full').style.display = 'none'; document.getElementById('2406.05613v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.01252">arXiv:2406.01252</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.01252">pdf</a>, <a href="https://arxiv.org/format/2406.01252">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Towards Scalable Automated Alignment of LLMs: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Cao%2C+B">Boxi Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+K">Keming Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+X">Xinyu Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiawei Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Mengjie Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Xiang%2C+H">Hao Xiang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+P">Peilin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&amp;query=He%2C+B">Ben He</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+L">Le Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+B">Bowen Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.01252v3-abstract-short" style="display: inline;"> Alignment is the most critical step in building large language models (LLMs) that meet human needs. With the rapid development of LLMs gradually surpassing human capabilities, traditional alignment methods based on human annotation are increasingly unable to meet the scalability demands.
Therefore, there is an urgent need to explore new sources of automated alignment signals and technical approach&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01252v3-abstract-full').style.display = 'inline'; document.getElementById('2406.01252v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.01252v3-abstract-full" style="display: none;"> Alignment is the most critical step in building large language models (LLMs) that meet human needs. With the rapid development of LLMs gradually surpassing human capabilities, traditional alignment methods based on human annotation are increasingly unable to meet the scalability demands. Therefore, there is an urgent need to explore new sources of automated alignment signals and technical approaches. In this paper, we systematically review the recently emerging methods of automated alignment, attempting to explore how to achieve effective, scalable, automated alignment once the capabilities of LLMs exceed those of humans. Specifically, we categorize existing automated alignment methods into four major categories based on the sources of alignment signals and discuss the current status and potential development of each category. Additionally, we explore the underlying mechanisms that enable automated alignment and discuss the essential factors that make automated alignment technologies feasible and effective, from the perspective of the fundamental role of alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.01252v3-abstract-full').style.display = 'none'; document.getElementById('2406.01252v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024.
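<p>A common shape of the automated alignment signals this survey categorizes is model-generated preference data. The sketch below is a hypothetical collection loop in that spirit; sample() and judge() are placeholders for model calls, and the pipeline is a generic illustration rather than any specific method from the survey.</p> <pre><code>
def collect_preference_pairs(prompts, sample, judge):
    # Use a model (or a stronger model) as the preference labeller:
    # sample two candidate responses per prompt, let judge() pick one.
    pairs = []
    for p in prompts:
        a, b = sample(p), sample(p)
        chosen, rejected = (a, b) if judge(p, a, b) == "A" else (b, a)
        pairs.append({"prompt": p, "chosen": chosen, "rejected": rejected})
    return pairs  # could then feed DPO/RLHF-style training
</code></pre>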
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper List: https://github.com/cascip/awesome-auto-alignment</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.03178">arXiv:2405.03178</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.03178">pdf</a>, <a href="https://arxiv.org/format/2405.03178">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> POPDG: Popular 3D Dance Generation with PopDanceSet </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Z">Zhenye Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Min Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xuecai Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yongzhen Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+L">Li Yao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.03178v1-abstract-short" style="display: inline;"> Generating dances that are both lifelike and well-aligned with music continues to be a challenging task in the cross-modal domain. This paper introduces PopDanceSet, the first dataset tailored to the preferences of young audiences, enabling the generation of aesthetically oriented dances. It surpasses the AIST++ dataset in music genre diversity and the intricacy and depth of dance movements. M&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.03178v1-abstract-full').style.display = 'inline'; document.getElementById('2405.03178v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.03178v1-abstract-full" style="display: none;"> Generating dances that are both lifelike and well-aligned with music continues to be a challenging task in the cross-modal domain. This paper introduces PopDanceSet, the first dataset tailored to the preferences of young audiences, enabling the generation of aesthetically oriented dances. It surpasses the AIST++ dataset in music genre diversity and the intricacy and depth of dance movements. Moreover, the proposed POPDG model within the iDDPM framework enhances dance diversity and, through the Space Augmentation Algorithm, strengthens spatial physical connections between human body joints, ensuring that increased diversity does not compromise generation quality. A streamlined Alignment Module is also designed to improve the temporal alignment between dance and music. Extensive experiments show that POPDG achieves SOTA results on two datasets. Furthermore, the paper expands on current evaluation metrics. The dataset and code are available at https://github.com/Luke-Luo1/POPDG.
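<p>The iDDPM-style generation loop underlying this entry can be sketched generically. The code below is a schematic DDPM ancestral-sampling loop for a music-conditioned pose sequence; the noise schedule, shapes, and the model signature are illustrative assumptions, not POPDG's actual configuration.</p> <pre><code>
import torch

@torch.no_grad()
def ddpm_sample(model, music_feats, T=1000, joint_dim=147, seq_len=240):
    # Generic ancestral sampling: start from Gaussian noise and denoise
    # step by step, conditioning the noise predictor on music features.
    betas = torch.linspace(1e-4, 0.02, T)
    alphas = 1.0 - betas
    abar = torch.cumprod(alphas, dim=0)
    x = torch.randn(1, seq_len, joint_dim)
    for t in reversed(range(T)):
        eps = model(x, torch.tensor([t]), music_feats)   # predicted noise
        mean = (x - betas[t] / torch.sqrt(1.0 - abar[t]) * eps) / torch.sqrt(alphas[t])
        noise = torch.randn_like(x) if t else torch.zeros_like(x)
        x = mean + torch.sqrt(betas[t]) * noise
    return x  # denoised pose sequence
</code></pre>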
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.03178v1-abstract-full').style.display = 'none'; document.getElementById('2405.03178v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.19132">arXiv:2404.19132</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.19132">pdf</a>, <a href="https://arxiv.org/format/2404.19132">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Integrating Present and Past in Unsupervised Continual Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yipeng Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Charlin%2C+L">Laurent Charlin</a>, <a href="/search/cs?searchtype=author&amp;query=Zemel%2C+R">Richard Zemel</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Mengye Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.19132v2-abstract-short" style="display: inline;"> We formulate a unifying framework for unsupervised continual learning (UCL), which disentangles learning objectives that are specific to the present and the past data, encompassing stability, plasticity, and cross-task consolidation. The framework reveals that many existing UCL approaches overlook cross-task consolidation and try to balance plasticity and stability in a shared embedding space. Thi&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19132v2-abstract-full').style.display = 'inline'; document.getElementById('2404.19132v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.19132v2-abstract-full" style="display: none;"> We formulate a unifying framework for unsupervised continual learning (UCL), which disentangles learning objectives that are specific to the present and the past data, encompassing stability, plasticity, and cross-task consolidation. The framework reveals that many existing UCL approaches overlook cross-task consolidation and try to balance plasticity and stability in a shared embedding space. This results in worse performance due to a lack of within-task data diversity and reduced effectiveness in learning the current task. Our method, Osiris, which explicitly optimizes all three objectives on separate embedding spaces, achieves state-of-the-art performance on all benchmarks, including two novel benchmarks proposed in this paper featuring semantically structured task sequences. Compared to standard benchmarks, these two structured benchmarks more closely resemble visual signals received by humans and animals when navigating real-world environments. 
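<p>The three disentangled objectives named in this Osiris abstract (plasticity, stability, cross-task consolidation), each computed in its own embedding space, can be sketched as a combined loss. The code below is a schematic under stated assumptions: "heads" holds three projector networks, "sim" is some contrastive similarity loss, and z_mem_ema comes from an EMA/frozen encoder; none of these names are from the released code.</p> <pre><code>
def osiris_style_loss(z1, z2, z_mem, z_mem_ema, heads, sim):
    # plasticity: agreement of two augmented views of current-task data
    plasticity = sim(heads["cur"](z1), heads["cur"](z2))
    # stability: match replayed memory features to an EMA/frozen encoder
    stability = sim(heads["mem"](z_mem), z_mem_ema)
    # consolidation: relate current-task and memory samples in a shared space
    consolidation = sim(heads["cross"](z1), heads["cross"](z_mem))
    return plasticity + stability + consolidation
</code></pre>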
Finally, we show some preliminary evidence that continual models can benefit from such realistic learning scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.19132v2-abstract-full').style.display = 'none'; document.getElementById('2404.19132v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">CoLLAs 2024 (Oral)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.04904">arXiv:2404.04904</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.04904">pdf</a>, <a href="https://arxiv.org/format/2404.04904">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Cross-Domain Audio Deepfake Detection: Dataset and Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Mengxin Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+M">Miaomiao Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+D">Daimeng Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+H">Hao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.04904v2-abstract-short" style="display: inline;"> Audio deepfake detection (ADD) is essential for preventing the misuse of synthetic voices that may infringe on personal rights and privacy. Recent zero-shot text-to-speech (TTS) models pose higher risks as they can clone voices with a single utterance. However, the existing ADD datasets are outdated, leading to suboptimal generalization of detection models. In this paper, we construct a new cross-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04904v2-abstract-full').style.display = 'inline'; document.getElementById('2404.04904v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.04904v2-abstract-full" style="display: none;"> Audio deepfake detection (ADD) is essential for preventing the misuse of synthetic voices that may infringe on personal rights and privacy. Recent zero-shot text-to-speech (TTS) models pose higher risks as they can clone voices with a single utterance. However, the existing ADD datasets are outdated, leading to suboptimal generalization of detection models. 
In this paper, we construct a new cross-domain ADD dataset comprising over 300 hours of speech data that is generated by five advanced zero-shot TTS models. To simulate real-world scenarios, we employ diverse attack methods and audio prompts from different datasets. Experiments show that, through novel attack-augmented training, the Wav2Vec2-large and Whisper-medium models achieve equal error rates of 4.1% and 6.5%, respectively. Additionally, we demonstrate our models&#39; outstanding few-shot ADD ability by fine-tuning with just one minute of target-domain data. Nonetheless, neural codec compressors greatly affect the detection accuracy, necessitating further research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.04904v2-abstract-full').style.display = 'none'; document.getElementById('2404.04904v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 April, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> April 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.15362">arXiv:2403.15362</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.15362">pdf</a>, <a href="https://arxiv.org/format/2403.15362">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CoLLEGe: Concept Embedding Generation for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Teehan%2C+R">Ryan Teehan</a>, <a href="/search/cs?searchtype=author&amp;query=Lake%2C+B">Brenden Lake</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Mengye Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.15362v2-abstract-short" style="display: inline;"> Current language models are unable to quickly learn new concepts on the fly, often requiring a more involved finetuning process to learn robustly. Prompting in-context is not robust to context distractions, and often fails to confer much information about the new concepts.
Classic methods for few-shot word learning in NLP, relying on global word vectors, are less applicable to large language model&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15362v2-abstract-full').style.display = 'inline'; document.getElementById('2403.15362v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.15362v2-abstract-full" style="display: none;"> Current language models are unable to quickly learn new concepts on the fly, often requiring a more involved finetuning process to learn robustly. Prompting in-context is not robust to context distractions, and often fails to confer much information about the new concepts. Classic methods for few-shot word learning in NLP, relying on global word vectors, are less applicable to large language models. In this paper, we introduce a novel approach named CoLLEGe (Concept Learning with Language Embedding Generation) to modernize few-shot concept learning. CoLLEGe is a meta-learning framework capable of generating flexible embeddings for new concepts using a small number of example sentences or definitions. Our primary meta-learning objective is simply to enable a language model to make next-word predictions in forthcoming sentences, making it compatible with language model pretraining. We design a series of tasks to test new concept learning in challenging real-world scenarios, including new word acquisition, definition inference, and verbal reasoning, and demonstrate that our method succeeds in each setting without task-specific training. Code and data for our project can be found at https://college-concept-learning.github.io/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.15362v2-abstract-full').style.display = 'none'; document.getElementById('2403.15362v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.12987">arXiv:2403.12987</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.12987">pdf</a>, <a href="https://arxiv.org/format/2403.12987">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Specificity in SBDD: Leveraging Delta Score and Energy-Guided Diffusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gao%2C+B">Bowen Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Minsi Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+Y">Yuyan Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yanwen Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Qiang%2C+B">Bo Qiang</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+Z">Zhi-Ming Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+W">Wei-Ying Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+Y">Yanyan Lan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.12987v1-abstract-short" style="display: inline;"> In the field of Structure-based Drug Design (SBDD), deep learning-based generative models have achieved outstanding performance in terms of docking score. However, further study shows that the existing molecular generative methods and docking scores both lack consideration of specificity, which means that generated molecules bind to almost every protein pocket with high affinity.
T&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12987v1-abstract-full').style.display = 'inline'; document.getElementById('2403.12987v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.12987v1-abstract-full" style="display: none;"> In the field of Structure-based Drug Design (SBDD), deep learning-based generative models have achieved outstanding performance in terms of docking score. However, further study shows that the existing molecular generative methods and docking scores both lack consideration of specificity, which means that generated molecules bind to almost every protein pocket with high affinity. To address this, we introduce the Delta Score, a new metric for evaluating the specificity of molecular binding. To further incorporate this insight for generation, we develop an innovative energy-guided approach using contrastive learning, with active compounds as decoys, to direct generative models toward creating molecules with high specificity. Our empirical results show that this method not only enhances the delta score but also maintains or improves traditional docking scores, successfully bridging the gap between SBDD and real-world needs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.12987v1-abstract-full').style.display = 'none'; document.getElementById('2403.12987v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2403.09613">arXiv:2403.09613</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2403.09613">pdf</a>, <a href="https://arxiv.org/format/2403.09613">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Reawakening knowledge: Anticipatory recovery from catastrophic interference via structured training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Y">Yanlai Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Jones%2C+M">Matt Jones</a>, <a href="/search/cs?searchtype=author&amp;query=Mozer%2C+M+C">Michael C. Mozer</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+M">Mengye Ren</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2403.09613v1-abstract-short" style="display: inline;"> We explore the training dynamics of neural networks in a structured non-IID setting where documents are presented cyclically in a fixed, repeated sequence.
Typically, networks suffer from catastrophic interference when training on a sequence of documents; however, we discover a curious and remarkable property of LLMs fine-tuned sequentially in this setting: they exhibit anticipatory behavior, reco&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09613v1-abstract-full').style.display = 'inline'; document.getElementById('2403.09613v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2403.09613v1-abstract-full" style="display: none;"> We explore the training dynamics of neural networks in a structured non-IID setting where documents are presented cyclically in a fixed, repeated sequence. Typically, networks suffer from catastrophic interference when training on a sequence of documents; however, we discover a curious and remarkable property of LLMs fine-tuned sequentially in this setting: they exhibit anticipatory behavior, recovering from the forgetting on documents before encountering them again. The behavior emerges and becomes more robust as the architecture scales up its number of parameters. Through comprehensive experiments and visualizations, we uncover new insights into training over-parameterized networks in structured environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2403.09613v1-abstract-full').style.display = 'none'; document.getElementById('2403.09613v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 March, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> March 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 18 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2402.18243">arXiv:2402.18243</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2402.18243">pdf</a>, <a href="https://arxiv.org/format/2402.18243">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Learning or Self-aligning? 
arXiv:2402.18243 [pdf, other] (https://arxiv.org/abs/2402.18243)
Subjects: cs.CL (Computation and Language)
Learning or Self-aligning? Rethinking Instruction Fine-tuning
Authors: Mengjie Ren, Boxi Cao, Hongyu Lin, Cao Liu, Xianpei Han, Ke Zeng, Guanglu Wan, Xunliang Cai, Le Sun
Abstract: Instruction fine-tuning (IFT) is a critical phase in building large language models (LLMs). Previous work mainly focuses on IFT's role in the transfer of behavioral norms and the learning of additional world knowledge; however, the understanding of the underlying mechanisms of IFT remains significantly limited. In this paper, we design a knowledge intervention framework to decouple the potential underlying factors of IFT, enabling individual analysis of each factor. Surprisingly, our experiments reveal that attempting to learn additional world knowledge through IFT often struggles to yield positive impacts and can even lead to markedly negative effects. Further, we discover that maintaining internal knowledge consistency before and after IFT is a critical factor for achieving successful IFT. Our findings reveal the underlying mechanisms of IFT and provide robust support for some very recent and potential future works.
Submitted 11 August, 2024; v1 submitted 28 February, 2024; originally announced February 2024.
Comments: Camera Ready for ACL2024
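The notion of "internal knowledge consistency" can be pictured with a simple probe. This is an illustration of the idea only, not the paper's intervention framework; `ask` is a hypothetical greedy-decoding helper with signature `(model, question) -> str`:

```python
# Illustrative consistency probe: how often does a model give the same answer
# to factual questions before and after instruction fine-tuning?
def knowledge_consistency(base_model, ift_model, questions, ask):
    """Returns the fraction of probe questions on which the fine-tuned model
    still agrees with the base model's answer."""
    same = sum(ask(base_model, q).strip().lower() ==
               ask(ift_model, q).strip().lower() for q in questions)
    return same / len(questions)
```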
arXiv:2402.00300 [pdf, other] (https://arxiv.org/abs/2402.00300)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning); cs.NE (Neural and Evolutionary Computing); q-bio.NC (Neurons and Cognition)
Self-supervised learning of video representations from a child's perspective
Authors: A. Emin Orhan, Wentao Wang, Alex N. Wang, Mengye Ren, Brenden M. Lake
Abstract: Children learn powerful internal models of the world around them from a few years of egocentric visual experience. Can such internal models be learned from a child's visual experience with highly generic learning algorithms, or do they require strong inductive biases? Recent advances in collecting large-scale, longitudinal, developmentally realistic video datasets and generic self-supervised learning (SSL) algorithms are allowing us to begin to tackle this nature vs. nurture question. However, existing work typically focuses on image-based SSL algorithms and on visual capabilities that can be learned from static images (e.g., object recognition), thus ignoring temporal aspects of the world. To close this gap, here we train self-supervised video models on longitudinal, egocentric headcam recordings collected from a child over a two-year period in their early development (6-31 months). The resulting models are highly effective at facilitating the learning of action concepts from a small number of labeled examples; they have favorable data-size scaling properties; and they display emergent video interpolation capabilities. Video models also learn more accurate and more robust object representations than image-based models trained with exactly the same data. These results suggest that important temporal aspects of a child's internal model of the world may be learnable from their visual experience using highly generic learning algorithms and without strong inductive biases.
Submitted 16 October, 2024; v1 submitted 31 January, 2024; originally announced February 2024.
Comments: v3 updates results with significantly improved models; v2 was published as a conference paper at CogSci 2024; code & models available from https://github.com/eminorhan/video-models

arXiv:2401.11839 [pdf, other] (https://arxiv.org/abs/2401.11839)
Subjects: cs.CL (Computation and Language); cs.CY (Computers and Society)
AI for social science and social science of AI: A Survey
Authors: Ruoxi Xu, Yingfei Sun, Mengjie Ren, Shiguang Guo, Ruotong Pan, Hongyu Lin, Le Sun, Xianpei Han
Abstract: Recent advancements in artificial intelligence, particularly the emergence of large language models (LLMs), have sparked a rethinking of the possibilities of artificial general intelligence. The increasing human-like capabilities of AI are also attracting attention in social science research, leading to various studies exploring the combination of these two fields. In this survey, we systematically categorize previous explorations in the combination of AI and social science into two directions that share common technical approaches but differ in their research objectives. The first direction, AI for social science, uses AI as a powerful tool to enhance various stages of social science research. The second direction, the social science of AI, examines AI agents as social entities with their human-like cognitive and linguistic capabilities. By conducting a thorough review, particularly of the substantial progress facilitated by recent advancements in large language models, this paper introduces a fresh perspective to reassess the relationship between AI and social science, provides a cohesive framework that allows researchers to understand the distinctions and connections between AI for social science and social science of AI, and summarizes state-of-the-art experiment simulation platforms to facilitate research in these two directions. We believe that as AI technology continues to advance and intelligent agents find increasing applications in our daily lives, the significance of the combination of AI and social science will become even more prominent.
Submitted 22 January, 2024; originally announced January 2024.
Comments: Accepted by Information Processing and Management (IP&M)

arXiv:2401.11382 [pdf, other] (https://arxiv.org/abs/2401.11382)
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Using Large Language Model for End-to-End Chinese ASR and NER
Authors: Yuang Li, Jiawei Yu, Min Zhang, Mengxin Ren, Yanqing Zhao, Xiaofeng Zhao, Shimin Tao, Jinsong Su, Hao Yang
Abstract: Mapping speech tokens to the same feature space as text tokens has become the paradigm for integrating the speech modality into decoder-only large language models (LLMs). An alternative approach is to use an encoder-decoder architecture that incorporates speech features through cross-attention; this approach, however, has received less attention in the literature. In this work, we connect the Whisper encoder with ChatGLM3 and provide in-depth comparisons of these two approaches using Chinese automatic speech recognition (ASR) and named entity recognition (NER) tasks. We evaluate them not only with conventional metrics such as the F1 score but also with a novel fine-grained taxonomy of ASR-NER errors. Our experiments reveal that the encoder-decoder architecture outperforms the decoder-only architecture with a short context, while the decoder-only architecture benefits from a long context as it fully exploits all layers of the LLM.
By using the LLM, we significantly reduced entity omission errors and improved entity ASR accuracy compared to the Conformer baseline. Additionally, we obtained a state-of-the-art (SOTA) F1 score of 0.805 on the AISHELL-NER test set by using chain-of-thought (CoT) NER, which first infers the long-form ASR transcription and then predicts the NER labels.
Submitted 6 June, 2024; v1 submitted 20 January, 2024; originally announced January 2024.
Comments: 5 pages, 2 figures, Accepted to InterSpeech 2024
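The two-step chain-of-thought NER can be sketched as a pair of chained queries. This is schematic, not the paper's prompts; `llm(prompt)` is a hypothetical text-in/text-out interface to the speech-conditioned decoder:

```python
# Schematic two-step CoT NER over speech (illustrative prompts only).
def cot_asr_ner(llm, audio_context):
    # Step 1: infer the long-form ASR transcription first.
    transcript = llm(f"{audio_context}\nTranscribe the speech above.")
    # Step 2: predict NER labels conditioned on the intermediate transcript.
    entities = llm("List the person, location and organization entities in: "
                   + transcript)
    return transcript, entities
```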
arXiv:2312.12736 [pdf, other] (https://arxiv.org/abs/2312.12736)
Subjects: cs.CL (Computation and Language); cs.LG (Machine Learning)
Learning and Forgetting Unsafe Examples in Large Language Models
Authors: Jiachen Zhao, Zhun Deng, David Madras, James Zou, Mengye Ren
Abstract: As the number of large language models (LLMs) released to the public grows, there is a pressing need to understand the safety implications of these models learning from third-party custom finetuning data. We explore the behavior of LLMs finetuned on noisy custom data containing unsafe content, represented by datasets that contain biases, toxicity, and harmfulness. We find that while aligned LLMs readily learn this unsafe content, they also tend to forget it more significantly than other examples when subsequently finetuned on safer content. Drawing inspiration from this discrepancy in forgetting, we introduce the "ForgetFilter" algorithm, which filters unsafe data based on how strong the model's forgetting signal is for that data. We demonstrate that ForgetFilter ensures safety in customized finetuning without compromising downstream task performance, unlike sequential safety finetuning. ForgetFilter outperforms alternative strategies such as replay and moral self-correction in curbing LLMs' ability to assimilate unsafe content during custom finetuning, e.g., a toxicity score 75% lower than when no safety measures are applied and 62% lower than when using self-correction.
Submitted 3 July, 2024; v1 submitted 19 December, 2023; originally announced December 2023.
Comments: accepted by ICML 24
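The forgetting-based filtering idea reduces to measuring how much each custom example's loss rises after a round of safe finetuning. A minimal sketch follows, assuming a per-example loss helper `loss_on` and a `finetune` routine; the threshold and the exact schedule are schematic, not the paper's recipe:

```python
# Sketch of forgetting-signal filtering (illustrative, not the exact
# ForgetFilter recipe). Examples whose loss increases sharply after safe
# finetuning are treated as likely-unsafe and dropped.
def forget_filter(model, custom_data, safe_data, finetune, loss_on, tau):
    finetune(model, custom_data)                 # 1) learn the noisy custom data
    before = [loss_on(model, ex) for ex in custom_data]
    finetune(model, safe_data)                   # 2) finetune on safer content
    after = [loss_on(model, ex) for ex in custom_data]
    # 3) keep examples whose loss rose little: a strong forgetting signal
    # (large loss increase) flags the example as unsafe.
    return [ex for ex, b, a in zip(custom_data, before, after) if a - b < tau]
```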
arXiv:2312.06886 [pdf, other] (https://arxiv.org/abs/2312.06886)
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Relightful Harmonization: Lighting-aware Portrait Background Replacement
Authors: Mengwei Ren, Wei Xiong, Jae Shin Yoon, Zhixin Shu, Jianming Zhang, HyunJoon Jung, Guido Gerig, He Zhang
Abstract: Portrait harmonization aims to composite a subject into a new background, adjusting its lighting and color to ensure harmony with the background scene. Existing harmonization techniques often focus only on adjusting the global color and brightness of the foreground and ignore crucial illumination cues from the background, such as the apparent lighting direction, leading to unrealistic compositions. We introduce Relightful Harmonization, a lighting-aware diffusion model designed to seamlessly harmonize sophisticated lighting effects for the foreground portrait using any background image. Our approach unfolds in three stages. First, we introduce a lighting representation module that allows our diffusion model to encode lighting information from the target image background. Second, we introduce an alignment network that aligns lighting features learned from the image background with lighting features learned from panorama environment maps, a complete representation of scene illumination. Last, to further boost the photorealism of the proposed method, we introduce a novel data simulation pipeline that generates synthetic training pairs from a diverse range of natural images, which are used to refine the model. Our method outperforms existing benchmarks in visual fidelity and lighting coherence, showing superior generalization in real-world testing scenarios and highlighting its versatility and practicality.
Submitted 7 April, 2024; v1 submitted 11 December, 2023; originally announced December 2023.
Comments: CVPR 2024 camera ready

arXiv:2312.06168 [pdf, other] (https://arxiv.org/abs/2312.06168)
Subjects: cs.RO (Robotics)
A Novel Planning Framework for Complex Flipping Manipulation of Multiple Mobile Manipulators
Authors: Wenhang Liu, Meng Ren, Kun Song, Michael Yu Wang, Zhenhua Xiong
Abstract: During complex object manipulation, manipulator systems often face the configuration disconnectivity problem due to closed-chain constraints. Although regrasping can be adopted to obtain a piecewise-connected manipulation, it is challenging to determine whether a planning result exists without regrasping. To address this problem, a novel planning framework is proposed for multiple mobile manipulator systems. Coordinated platform motions and regrasping motions are proposed to enhance configuration connectivity. Given the object trajectory and the grasping pose set, the planning framework includes three steps. First, inverse kinematics for each mobile manipulator is verified along the given trajectory for different grasping poses, determining the coverable trajectory segments of each robot for a specific grasping pose. Second, the trajectory choice problem is formulated as a set cover problem, by which we can quickly determine whether the manipulation can be completed without regrasping, or with the minimal number of regrasps. Finally, the motions of each mobile manipulator are planned over the assigned trajectory segments using existing methods. Both simulations and experimental results show the performance of the planner in complex flipping manipulation.
Additionally, the proposed planner can greatly extend the adaptability of multiple mobile manipulator systems in complex manipulation tasks.
Submitted 25 October, 2024; v1 submitted 11 December, 2023; originally announced December 2023.
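The set-cover formulation in the second step can be made concrete with a standard greedy sketch (an illustration of the formulation, not necessarily the authors' solver): each (robot, grasping pose) pair covers a set of trajectory-segment indices, and we repeatedly pick the pair that covers the most still-uncovered segments:

```python
# Greedy set cover over trajectory segments (illustrative solver).
# `coverage` maps a (robot, grasp_pose) key to the set of trajectory-segment
# indices that pair can execute without violating kinematic constraints.
def choose_assignments(coverage, all_segments):
    uncovered, chosen = set(all_segments), []
    while uncovered:
        best = max(coverage, key=lambda k: len(coverage[k] & uncovered))
        if not coverage[best] & uncovered:
            return None  # some segment is coverable by no pair: coordinated
                         # platform motion or a different pose set is needed
        chosen.append(best)
        uncovered -= coverage[best]
    return chosen  # each switch between chosen pairs corresponds to a regrasp
```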
arXiv:2312.05269 [pdf, other] (https://arxiv.org/abs/2312.05269)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
LifelongMemory: Leveraging LLMs for Answering Queries in Long-form Egocentric Videos
Authors: Ying Wang, Yanlai Yang, Mengye Ren
Abstract: In this paper we introduce LifelongMemory, a new framework for accessing long-form egocentric videographic memory through natural language question answering and retrieval. LifelongMemory generates concise video activity descriptions of the camera wearer and leverages the zero-shot capabilities of pretrained large language models to perform reasoning over long-form video context. Furthermore, LifelongMemory uses a confidence and explanation module to produce confident, high-quality, and interpretable answers. Our approach achieves state-of-the-art performance on the EgoSchema benchmark for question answering and is highly competitive on the natural language query (NLQ) challenge of Ego4D. Code is available at https://github.com/agentic-learning-ai-lab/lifelong-memory.
Submitted 5 November, 2024; v1 submitted 7 December, 2023; originally announced December 2023.
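The caption-then-reason pipeline described above can be sketched in a few lines. Assumed interfaces: `caption(clip)` is some pretrained video captioner and `llm(prompt)` a pretrained language model; the prompt wording is illustrative, and the released code at the link above is the authoritative version:

```python
# Schematic caption-then-reason pipeline for long-form egocentric video QA.
def answer_query(video_clips, question, caption, llm):
    # 1) condense the video into short natural-language activity descriptions
    log = "\n".join(f"[{i}] {caption(c)}" for i, c in enumerate(video_clips))
    # 2) let the LLM reason over the textual memory and justify its answer
    return llm(f"Activity log of the camera wearer:\n{log}\n\n"
               f"Question: {question}\n"
               "Answer, then state your confidence and a brief explanation:")
```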
arXiv:2312.03214 [pdf, other] (https://arxiv.org/abs/2312.03214)
Subjects: cs.PL (Programming Languages)
Optimistic Global Function Merger
Authors: Kyungwoo Lee, Manman Ren, Ellis Hoag
Abstract: Function merging is a pivotal technique for reducing code size by combining identical or similar functions into a single function. While prior research has extensively explored this technique, it has not been assessed in conjunction with function outlining and the linker's identical code folding, despite substantial common ground. Traditional approaches require the complete intermediate representation to compare functions; consequently, none of them offers a scalable solution compatible with separate compilation while achieving global function merging, which is critical for large app development. In this paper, we introduce our global function merger, which leverages global merge information from previous code generation runs to optimistically create merging instances within each module context independently. Notably, our approach remains sound even when intermediate representations change, making it well-suited for distributed build environments. We present a comprehensive code generation framework that can run both the state-of-the-art global function outliner and our global function merger. These components complement each other, resulting in a positive impact on code size reduction. Our evaluation demonstrates that integrating the global function merger with a state-of-the-art global function outliner fully optimized with ThinLTO attains a further code size reduction of up to 3.5%, on top of the initial average reduction of 17.3% achieved through global function outlining for real-world iOS apps, all with minimal extra build time.
Submitted 5 December, 2023; originally announced December 2023.
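One way to picture the "optimistic" use of prior-run merge information is the following simplified illustration (an assumption about the mechanism, not the actual LLVM implementation): hash a name-insensitive structural summary of each function, record which hashes appeared in more than one module in the previous build, and let each module consult that record independently in the next build:

```python
# Simplified illustration of optimistic, hash-driven merge planning.
# `summarize` is an assumed helper producing a stable, name-insensitive
# structural summary of a function's IR.
import hashlib

def stable_hash(func_ir, summarize):
    return hashlib.sha256(summarize(func_ir).encode()).hexdigest()

def plan_merges(module_funcs, global_merge_info, summarize):
    """global_merge_info: hashes seen in >1 module in the previous build.
    Each module consults it independently, so no cross-module IR is needed
    and the scheme stays compatible with separate compilation."""
    plan = {}
    for name, ir in module_funcs.items():
        h = stable_hash(ir, summarize)
        if h in global_merge_info:     # optimistic: assume the other copies
            plan[name] = h             # still exist; duplicates fold later
    return plan
```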
arXiv:2311.17218 [pdf, other] (https://arxiv.org/abs/2311.17218)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
BIM: Block-Wise Self-Supervised Learning with Masked Image Modeling
Authors: Yixuan Luo, Mengye Ren, Sai Qian Zhang
Abstract: Like masked language modeling (MLM) in natural language processing, masked image modeling (MIM) aims to extract valuable insights from image patches to enhance the feature-extraction capabilities of the underlying deep neural network (DNN). In contrast to other training paradigms such as supervised learning and unsupervised contrastive learning, MIM pretraining typically demands significant computational resources to manage large training batches (e.g., 4096); the resulting memory and computation requirements pose a considerable challenge to its broad adoption. To mitigate this, we introduce a novel learning framework, termed Block-Wise Masked Image Modeling (BIM), which decomposes the MIM task into several sub-tasks with independent computation patterns, yielding block-wise back-propagation instead of the traditional end-to-end approach. BIM maintains superior performance compared to conventional MIM while greatly reducing peak memory consumption. Moreover, BIM naturally enables the concurrent training of numerous DNN backbones of varying depths, producing multiple trained backbones, each tailored to hardware platforms with distinct computing capabilities, at a significantly lower computational cost than training each backbone individually. Our framework offers a promising solution for resource-constrained training of MIM.
Submitted 28 November, 2023; originally announced November 2023.
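The memory saving from block-wise back-propagation comes from stopping gradients at block boundaries, so only one block's activation graph is alive at a time. A minimal sketch of the idea (not the BIM architecture; the per-block heads and shared reconstruction target are simplifications):

```python
# Sketch of block-wise training with local losses and stop-gradients.
import torch

def blockwise_step(blocks, local_heads, optimizer, masked_input, target):
    x = masked_input
    total = 0.0
    for block, head in zip(blocks, local_heads):
        x = block(x.detach())           # detach: no gradient to earlier blocks
        loss = torch.nn.functional.mse_loss(head(x), target)
        loss.backward()                 # frees this block's graph immediately
        total += loss.item()
    optimizer.step()                    # one step over all blocks' gradients
    optimizer.zero_grad()
    return total
```

Because each `backward()` releases the corresponding block's activations, peak memory scales with the largest block rather than the whole network.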
Recent advances in this area have seen the adoption of deep generative models and geometric deep learning techniques, modeling SBDD as a conditional generation task in which the target structure serves as context. Historically, evaluation of these models has centered on docking scores, which quantify the predicted binding affinity between a molecule and its target pocket. Although state-of-the-art models report that a majority of their generated ligands exceed the docking scores of the ground-truth ligands in test sets, this raises the question: do these scores align with real-world biological needs? In this paper, we introduce the delta score, a novel evaluation metric grounded in tangible pharmaceutical requisites. Our experiments reveal that molecules produced by current deep generative models significantly lag behind ground-truth reference ligands when assessed with the delta score. This novel metric not only complements existing benchmarks but also provides a pivotal direction for subsequent research in the domain.
Submitted 1 November, 2023; originally announced November 2023.
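Together with the specificity framing in the companion entry above (arXiv:2403.12987), the delta score idea can be pictured as the gap between a molecule's docking score against its designated pocket and against other pockets. The precise definition is in the paper; the following is schematic, with `dock(mol, pocket)` an assumed docking oracle where lower scores mean stronger binding:

```python
# Schematic specificity-style delta score (illustrative; see the paper for
# the exact definition).
def delta_score(mol, target_pocket, decoy_pockets, dock):
    target = dock(mol, target_pocket)
    decoys = sum(dock(mol, p) for p in decoy_pockets) / len(decoy_pockets)
    # A molecule that binds every pocket equally well scores ~0; a specific
    # binder scores well below its average affinity for the decoy pockets.
    return target - decoys
```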
arXiv:2311.02007 [pdf, other] (https://arxiv.org/abs/2311.02007)
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.RO (Robotics)
Towards Unsupervised Object Detection From LiDAR Point Clouds
Authors: Lunjun Zhang, Anqi Joyce Yang, Yuwen Xiong, Sergio Casas, Bin Yang, Mengye Ren, Raquel Urtasun
Abstract: In this paper, we study the problem of unsupervised object detection from 3D point clouds in self-driving scenes. We present a simple yet effective method that exploits (i) point clustering in near-range areas where the point clouds are dense, (ii) temporal consistency to filter out noisy unsupervised detections, (iii) translation equivariance of CNNs to extend the auto-labels to long range, and (iv) self-supervision for improving on its own. Our approach, OYSTER (Object Discovery via Spatio-Temporal Refinement), does not impose constraints on data collection (such as repeated traversals of the same location), is able to detect objects in a zero-shot manner without supervised finetuning (even in sparse, distant regions), and continues to self-improve given more rounds of iterative self-training. To better measure model performance in self-driving scenarios, we propose a new planning-centric perception metric based on distance-to-collision. We demonstrate that our unsupervised object detector significantly outperforms unsupervised baselines on PandaSet and the Argoverse 2 Sensor dataset, showing promise that self-supervision combined with object priors can enable object discovery in the wild. For more information, visit the project website: https://waabi.ai/research/oyster
Submitted 3 November, 2023; originally announced November 2023.
Comments: CVPR 2023
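Step (i), clustering dense near-range points into initial box proposals, is the easiest piece to illustrate. A sketch using scikit-learn's DBSCAN follows; the range and clustering parameters are assumptions, and OYSTER's full pipeline adds the temporal filtering, long-range extrapolation, and self-training steps on top:

```python
# Sketch of near-range point clustering into box proposals (illustrative).
import numpy as np
from sklearn.cluster import DBSCAN

def near_range_proposals(points, max_range=40.0, eps=0.7, min_points=10):
    """points: (N, 3) LiDAR sweep in the ego frame. Returns axis-aligned
    bird's-eye-view boxes (xmin, ymin, xmax, ymax) for dense clusters."""
    near = points[np.linalg.norm(points[:, :2], axis=1) < max_range]
    labels = DBSCAN(eps=eps, min_samples=min_points).fit_predict(near[:, :2])
    boxes = []
    for k in set(labels) - {-1}:        # -1 is DBSCAN's noise label
        c = near[labels == k]
        boxes.append((c[:, 0].min(), c[:, 1].min(),
                      c[:, 0].max(), c[:, 1].max()))
    return boxes
```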
arXiv:2310.19849 [pdf, other] (https://arxiv.org/abs/2310.19849)
Subjects: q-bio.BM (Biomolecules); cs.LG (Machine Learning); q-bio.QM (Quantitative Methods)
Predicting mutational effects on protein-protein binding via a side-chain diffusion probabilistic model
Authors: Shiwei Liu, Tian Zhu, Milong Ren, Chungong Yu, Dongbo Bu, Haicang Zhang
Abstract: Many crucial biological processes rely on networks of protein-protein interactions. Predicting the effect of amino acid mutations on protein-protein binding is vital in protein engineering and therapeutic discovery. However, the scarcity of annotated experimental data on binding energy poses a significant challenge for developing computational approaches, particularly deep learning-based methods. In this work, we propose SidechainDiff, a representation learning-based approach that leverages unlabelled experimental protein structures. SidechainDiff utilizes a Riemannian diffusion model to learn the generative process of side-chain conformations and can also give the structural context representations of mutations on the protein-protein interface. Leveraging the learned representations, we achieve state-of-the-art performance in predicting the mutational effects on protein-protein binding.
Furthermore, SidechainDiff is the first diffusion-based generative model for side-chains, distinguishing it from prior efforts that have predominantly focused on generating protein backbone structures.
Submitted 30 October, 2023; originally announced October 2023.
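Side-chain conformations are commonly parameterized by chi torsion angles, which live on a torus, which is why a Riemannian rather than Euclidean diffusion is natural. As a cartoon of the forward-noising step only (SidechainDiff's actual formulation is more involved), one can add Gaussian noise and wrap back onto the angular domain:

```python
# Toy forward-noising step for side-chain torsion (chi) angles on a torus.
import numpy as np

def noise_chi_angles(chi, t, betas):
    """chi: (num_residues, 4) torsion angles in radians; t: timestep index;
    betas: variance schedule. Noise is wrapped modulo 2*pi."""
    alpha_bar = np.prod(1.0 - betas[: t + 1])
    eps = np.random.randn(*chi.shape)
    noisy = np.sqrt(alpha_bar) * chi + np.sqrt(1.0 - alpha_bar) * eps
    return (noisy + np.pi) % (2.0 * np.pi) - np.pi  # wrap to [-pi, pi)
```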
arXiv:2310.10239 [pdf, other] (https://arxiv.org/abs/2310.10239)
Subjects: stat.ML (Machine Learning); cs.LG (Machine Learning); stat.ME (Methodology)
Structural transfer learning of non-Gaussian DAG
Authors: Mingyang Ren, Xin He, Junhui Wang
Abstract: Directed acyclic graphs (DAGs) are widely employed to represent directional relationships among a set of collected nodes. Yet the data available in a single study are often limited for accurate DAG reconstruction, whereas heterogeneous data may be collected from multiple relevant studies. It remains an open question how to pool the heterogeneous data together for better DAG structure reconstruction in the target study. In this paper, we first introduce a novel set of structural similarity measures for DAGs and then present a transfer DAG learning framework that effectively leverages information from auxiliary DAGs with different levels of similarity. Our theoretical analysis shows substantial improvement in DAG reconstruction in the target study, even when no auxiliary DAG is overall similar to the target DAG, in sharp contrast to most existing transfer learning methods. The advantage of the proposed transfer DAG learning is also supported by extensive numerical experiments on both synthetic data and multi-site brain functional connectivity network data.
Submitted 16 October, 2023; originally announced October 2023.
Comments: 35 pages, 3 figures, 3 tables

arXiv:2310.07846 [pdf, other] (https://arxiv.org/abs/2310.07846)
Subjects: cs.AR (Hardware Architecture)
DAG-aware Synthesis Orchestration
Authors: Yingjie Li, Mingju Liu, Mark Ren, Alan Mishchenko, Cunxi Yu
Abstract: The key methodologies of modern logic synthesis operate on multi-level, technology-independent representations of digital logic, such as And-Inverter Graphs (AIGs), via directed-acyclic-graph (DAG) traversal-based structural rewriting, resubstitution, and refactoring. Existing state-of-the-art DAG-aware logic synthesis algorithms are all designed to perform stand-alone optimizations during a single DAG traversal.
arXiv:2310.07846 (https://arxiv.org/abs/2310.07846) [pdf, other] cs.AR
DAG-aware Synthesis Orchestration
Authors: Yingjie Li, Mingju Liu, Mark Ren, Alan Mishchenko, Cunxi Yu
Abstract: Modern logic synthesis operates on multi-level, technology-independent representations of digital logic, such as And-Inverter Graphs (AIGs), via structural rewriting, resubstitution, and refactoring driven by directed-acyclic-graph (DAG) traversal. Existing state-of-the-art DAG-aware logic synthesis algorithms are all designed to perform stand-alone optimizations during a single DAG traversal. However, we empirically identify and demonstrate that this design limits both quality of results and runtime. This work proposes Synthesis Orchestration, which orchestrates the stand-alone operations within a single traversal of the AIG; the orchestration method thus explores more optimization opportunities and achieves better results. Our experiments cover all 104 designs collected from the ISCAS'85/89/99, VTR, and EPFL benchmark suites and show consistent logic minimization improvements over rewriting, resubstitution, and refactoring, with an average of 4% more node reduction and improved runtime efficiency for single optimizations. Moreover, we evaluate orchestration as a plug-in algorithm in the resyn and resyn3 flows in ABC, where it again yields consistent logic minimization improvements (3.8% and 10.9% more node reduction on average). The runtime analysis shows that orchestration outperforms the stand-alone algorithms in both AIG minimization and runtime efficiency. Finally, we integrate orchestration into OpenROAD for end-to-end performance evaluation; the advantages persist even after technology mapping and post-routing in the design flow.
Submitted 14 July, 2024; v1 submitted 11 October, 2023; originally announced October 2023.
Journal ref: IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems (TCAD), 2024
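The abstract names the ingredients but not the orchestration policy. A minimal toy sketch of the single-traversal idea, where each node tries every stand-alone transform and keeps whichever shrinks its local cone most, could look like the following (the Node class and transform stubs are invented for illustration and are not ABC's data structures or algorithms):

```python
from dataclasses import dataclass, field

@dataclass
class Node:
    """Toy stand-in for an AIG node; `size` is the node count of its cone."""
    name: str
    size: int
    fanins: list = field(default_factory=list)

# Placeholder local transforms: each returns the cone size it would achieve.
def try_rewrite(n):  return n.size - 1 if n.size > 2 else n.size
def try_resub(n):    return n.size - 2 if n.size > 4 else n.size
def try_refactor(n): return n.size

TRANSFORMS = [try_rewrite, try_resub, try_refactor]

def orchestrate(topo_order):
    """One DAG traversal; at each node, apply whichever transform helps most."""
    saved = 0
    for node in topo_order:
        best = min(t(node) for t in TRANSFORMS)
        saved += node.size - best
        node.size = best
    return saved

print(orchestrate([Node(f"n{i}", size=5 + i) for i in range(4)]))
```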
arXiv:2310.06367 (https://arxiv.org/abs/2310.06367) [pdf, other] cs.LG
DrugCLIP: Contrastive Protein-Molecule Representation Learning for Virtual Screening
Authors: Bowen Gao, Bo Qiang, Haichuan Tan, Minsi Ren, Yinjun Jia, Minsi Lu, Jingjing Liu, Weiying Ma, Yanyan Lan
Abstract: Virtual screening, which identifies potential drugs from vast compound databases to bind with a particular protein pocket, is a critical step in AI-assisted drug discovery. Traditional docking methods are highly time-consuming and can only work with a restricted search library in real-life applications. Recent supervised learning approaches using scoring functions for binding-affinity prediction, although promising, have not yet surpassed docking methods due to their strong dependency on limited data with reliable binding-affinity labels. In this paper, we propose a novel contrastive learning framework, DrugCLIP, that reformulates virtual screening as a dense retrieval task and employs contrastive learning to align representations of binding protein pockets and molecules from a large quantity of pairwise data without explicit binding-affinity scores. We also introduce a biological-knowledge-inspired data augmentation strategy to learn better protein-molecule representations. Extensive experiments show that DrugCLIP significantly outperforms traditional docking and supervised learning methods on diverse virtual screening benchmarks with greatly reduced computation time, especially in the zero-shot setting.
Submitted 10 October, 2023; originally announced October 2023.
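DrugCLIP casts screening as dense retrieval with CLIP-style contrastive alignment. The encoders and data pipeline are not specified in the abstract, but the symmetric InfoNCE objective such a framework would build on has a standard form; a sketch with placeholder embeddings:

```python
import torch
import torch.nn.functional as F

def clip_loss(pocket_emb, mol_emb, temperature=0.07):
    """Symmetric InfoNCE over a batch of paired pocket/molecule embeddings.

    pocket_emb, mol_emb: (B, D) outputs of placeholder encoders, one matched
    pocket-molecule pair per row; all other in-batch pairs act as negatives.
    """
    p = F.normalize(pocket_emb, dim=-1)
    m = F.normalize(mol_emb, dim=-1)
    logits = p @ m.t() / temperature          # (B, B) similarity matrix
    targets = torch.arange(p.size(0), device=p.device)
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))
```

At screening time a pocket embedding can be scored against a precomputed library of molecule embeddings by dot product, which is what makes retrieval fast relative to docking.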
arXiv:2310.06283 (https://arxiv.org/abs/2310.06283) [pdf, other] cs.CV
Towards More Efficient Depression Risk Recognition via Gait
Authors: Min Ren, Muchan Tao, Xuecai Hu, Xiaotong Liu, Qiong Li, Yongzhen Huang
Abstract: Depression, a highly prevalent mental illness, affects over 280 million individuals worldwide. Early detection and timely intervention are crucial for promoting remission, preventing relapse, and alleviating the emotional and financial burdens associated with depression. However, patients with depression often go undiagnosed in the primary care setting. Unlike many physiological illnesses, depression lacks objective indicators of risk, and existing recognition methods are time-consuming and hampered by a shortage of trained medical professionals. The correlation between gait and depression risk has been empirically established, and gait can serve as a promising objective biomarker with the advantage of efficient and convenient data collection. However, current gait-based methods for recognizing depression risk have only been validated on small, private datasets, with no large-scale publicly available dataset for research purposes, and they are primarily limited to hand-crafted approaches. Gait is a complex form of motion, and hand-crafted gait features often capture only a fraction of the intricate associations between gait and depression risk. Therefore, this study first constructs a large-scale gait database encompassing over 1,200 individuals and 40,000 gait sequences, covering six perspectives and three types of attire, with two commonly used psychological scales provided as depression risk annotations. A deep learning-based depression risk recognition model is then proposed, overcoming the limitations of hand-crafted approaches. Experiments on the constructed large-scale database validate the effectiveness of the proposed method, and the paper presents numerous instructive insights, highlighting the significant potential of gait-based depression risk recognition.
Submitted 9 October, 2023; originally announced October 2023.
arXiv:2310.01680 (https://arxiv.org/abs/2310.01680) [pdf, other] cs.CV cs.AI
Keypoint-Augmented Self-Supervised Learning for Medical Image Segmentation with Limited Annotation
Authors: Zhangsihao Yang, Mengwei Ren, Kaize Ding, Guido Gerig, Yalin Wang
Abstract: Pretraining CNN models (e.g., UNet) through self-supervision has become a powerful approach to facilitate medical image segmentation under low-annotation regimes. Recent contrastive learning methods encourage similar global representations when the same image undergoes different transformations, or enforce invariance across different image/patch features that are intrinsically correlated. However, CNN-extracted global and local features are limited in capturing the long-range spatial dependencies that are essential in biological anatomy. To this end, we present a keypoint-augmented fusion layer that extracts representations preserving both short- and long-range self-attention. In particular, we augment the CNN feature map at multiple scales by incorporating an additional input that learns long-range spatial self-attention among localized keypoint features. Further, we introduce both global and local self-supervised pretraining for the framework. At the global scale, we obtain global representations from both the bottleneck of the UNet and by aggregating multiscale keypoint features; these global features are then regularized through image-level contrastive objectives. At the local scale, we define a distance-based criterion to first establish correspondences among keypoints and then encourage similarity between their features. Through extensive experiments on both MRI and CT segmentation tasks, we demonstrate the architectural advantages of our proposed method over both CNN- and Transformer-based UNets when all architectures are trained with randomly initialized weights. With our proposed pretraining strategy, our method further outperforms existing SSL methods, producing more robust self-attention and achieving state-of-the-art segmentation results. The code is available at https://github.com/zshyang/kaf.git.
Submitted 18 October, 2023; v1 submitted 2 October, 2023; originally announced October 2023.
Comments: Camera ready for NeurIPS 2023. Code available at https://github.com/zshyang/kaf.git
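A minimal sketch of the described fusion idea, augmenting a CNN feature map with self-attention over localized keypoint features; shapes and the module name are invented for illustration and this is not the paper's exact layer:

```python
import torch
import torch.nn as nn

class KeypointAugmentedFusion(nn.Module):
    """Fuse CNN features with long-range self-attention over keypoints.

    Illustrative only: keypoint descriptors attend to one another, then the
    attended features are added back onto the CNN map at their locations.
    """
    def __init__(self, channels: int, num_heads: int = 4):
        super().__init__()
        self.attn = nn.MultiheadAttention(channels, num_heads, batch_first=True)
        self.proj = nn.Conv2d(channels, channels, kernel_size=1)

    def forward(self, fmap, kp_feats, kp_xy):
        # fmap: (B, C, H, W); kp_feats: (B, K, C);
        # kp_xy: (B, K, 2) integer (long) pixel coordinates (x, y).
        attended, _ = self.attn(kp_feats, kp_feats, kp_feats)  # (B, K, C)
        out = fmap.clone()
        b_idx = torch.arange(fmap.size(0)).unsqueeze(1)        # (B, 1)
        # Scatter the attended keypoint features back onto the feature map.
        out[b_idx, :, kp_xy[..., 1], kp_xy[..., 0]] += attended
        return self.proj(out)
```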
arXiv:2309.09552 (https://arxiv.org/abs/2309.09552) [pdf, other] cs.AI cs.CL
A Multitask Training Approach to Enhance Whisper with Contextual Biasing and Open-Vocabulary Keyword Spotting
Authors: Yuang Li, Min Zhang, Chang Su, Yinglu Li, Xiaosong Qiao, Mengxin Ren, Miaomiao Ma, Daimeng Wei, Shimin Tao, Hao Yang
Abstract: The recognition of rare named entities, such as personal names and terminology, is challenging for automatic speech recognition (ASR) systems, especially when they are not frequently observed in the training data. In this paper, we introduce keyword spotting enhanced Whisper (KWS-Whisper), a novel ASR system that leverages the Whisper model and performs open-vocabulary keyword spotting (OV-KWS) on the hidden states of the Whisper encoder to recognize user-defined named entities. These entities then serve as prompts for the Whisper decoder. To optimize the model, we propose a multitask training approach that jointly learns the OV-KWS and contextual-ASR tasks. We evaluate our approach on Chinese Aishell hot-word subsets and two internal code-switching test sets and show that it significantly improves entity recall compared to the original Whisper model. Moreover, we demonstrate that OV-KWS can serve as a plug-and-play module to enhance ASR error-correction methods and frozen Whisper models.
Submitted 6 June, 2024; v1 submitted 18 September, 2023; originally announced September 2023.
Comments: 5 pages, 2 figures, Accepted to InterSpeech 2024
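The paper learns OV-KWS over encoder hidden states via multitask training; as a purely illustrative sketch of the spot-then-prompt flow (the embeddings and threshold matching here are placeholders, not the trained model):

```python
import numpy as np

def spot_keywords(encoder_states, keyword_embs, names, threshold=0.8):
    """Toy open-vocabulary keyword spotting over encoder hidden states.

    encoder_states: (T, D) per-frame hidden states from a speech encoder.
    keyword_embs:   (K, D) embeddings of user-defined entities.
    Both are assumptions for illustration; the paper learns this matching.
    """
    e = encoder_states / np.linalg.norm(encoder_states, axis=1, keepdims=True)
    k = keyword_embs / np.linalg.norm(keyword_embs, axis=1, keepdims=True)
    sims = k @ e.T                        # (K, T) frame-level similarities
    hits = sims.max(axis=1) >= threshold  # entity fires if any frame matches
    return [n for n, h in zip(names, hits) if h]

def build_prompt(detected):
    """Detected entities become a decoding prompt that biases the recognizer."""
    return "Glossary: " + ", ".join(detected) if detected else ""
```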
arXiv:2309.00180 (https://arxiv.org/abs/2309.00180) [pdf] cs.CL
Exploring the law of text geographic information
Authors: Zhenhua Wang, Daiyu Zhang, Ming Ren, Guang Xu
Abstract: Textual geographic information is indispensable and heavily relied upon in practical applications, yet the absence of a clear distribution poses challenges to harnessing it effectively, which drives our exploration. We contend that geographic information is shaped by human behavior, cognition, expression, and thought processes, and, given our intuitive understanding of natural systems, we hypothesize that it conforms to the Gamma distribution. Through rigorous experiments on a diverse range of 24 datasets spanning different languages and types, we substantiate this hypothesis, uncovering the underlying regularities governing the dimensions of quantity, length, and distance in geographic information. Furthermore, theoretical analyses and comparisons with Gaussian distributions and Zipf's law rule out the possibility that these laws are coincidental. Significantly, we estimate upper bounds on human utilization of geographic information, pointing to uncharted territory, and we provide guidance for geographic information extraction. We hope this work helps lift the veil on geographic information.
Submitted 31 August, 2023; originally announced September 2023.
Comments: IPM
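To make the hypothesis concrete: testing whether an extracted quantity conforms to a Gamma distribution is a standard fit-and-test exercise. A sketch with a synthetic sample standing in for real extracted values (the corpus quantities themselves are not reproduced here):

```python
import numpy as np
from scipy import stats

# Hypothetical sample: e.g., counts of place names per document in a corpus.
rng = np.random.default_rng(0)
sample = rng.gamma(shape=2.0, scale=3.0, size=5000)

# Fit a Gamma distribution, pinning the location at 0 as is common for counts.
shape, loc, scale = stats.gamma.fit(sample, floc=0)

# Goodness of fit via Kolmogorov-Smirnov against the fitted distribution.
ks = stats.kstest(sample, "gamma", args=(shape, loc, scale))
print(f"shape={shape:.2f} scale={scale:.2f} KS p-value={ks.pvalue:.3f}")
```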
arXiv:2309.00178 (https://arxiv.org/abs/2309.00178) [pdf] cs.CL
Will Sentiment Analysis Need Subculture? A New Data Augmentation Approach
Authors: Zhenhua Wang, Simin He, Guang Xu, Ming Ren
Abstract: The renowned proverb that "The pen is mightier than the sword" underscores the formidable influence wielded by text in shaping sentiment: well-crafted writing can deeply resonate within cultures, conveying profound feeling. Nowadays, the omnipresence of the Internet has fostered a subculture that congregates around the contemporary milieu and artfully articulates the intricacies of human feeling by ardently pursuing the allure of novelty, a fact that cannot be disregarded in sentiment analysis. This paper strives to enrich data through the lens of subculture in order to address the shortage of training data faced by sentiment analysis. To this end, a new approach of subculture-based data augmentation (SCDA) is proposed, which generates six enhanced texts for each training text using six diverse subculture expression generators. Extensive experiments attest to the effectiveness and potential of SCDA. The results also shed light on the phenomenon that different subculture expressions elicit varying degrees of sentiment stimulation. Moreover, an intriguing conjecture arises, suggesting the linear reversibility of certain subculture expressions. We hope this study serves as a catalyst for heightened perceptiveness towards the interplay of information, sentiment, and culture, thereby enriching our collective understanding.
Submitted 31 August, 2023; originally announced September 2023.
Comments: JASIST
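The augmentation shape, six generators producing six variants per training text, is simple to picture. The toy string transforms below only illustrate that one-to-six structure; the paper's generators are genuine subculture expression rewriters, not these placeholders:

```python
# Illustrative placeholders for the paper's six subculture expression
# generators; each maps one training text to one augmented variant.
GENERATORS = [
    lambda t: t + " lol",
    lambda t: t.replace("good", "goated"),
    lambda t: t.upper(),
    lambda t: "ngl, " + t,
    lambda t: t + " fr fr",
    lambda t: t + " [doge]",
]

def scda_augment(text: str) -> list[str]:
    """Produce six subculture-flavored variants of one training text."""
    return [g(text) for g in GENERATORS]

print(scda_augment("the movie was good"))
```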
arXiv:2308.14024 (https://arxiv.org/abs/2308.14024) [pdf, other] cs.CV
Balanced Representation Learning for Long-tailed Skeleton-based Action Recognition
Authors: Hongda Liu, Yunlong Wang, Min Ren, Junxing Hu, Zhengquan Luo, Guangqi Hou, Zhenan Sun
Abstract: Skeleton-based action recognition has recently made significant progress. However, data imbalance is still a great challenge in real-world scenarios: the performance of current action recognition algorithms declines sharply when training data suffers from heavy class imbalance, because the imbalanced data degrades the learned representations and becomes the bottleneck for action recognition. How to learn unbiased representations from imbalanced action data is the key to long-tailed action recognition. In this paper, we propose a novel balanced representation learning method to address the long-tailed problem in action recognition. Firstly, a spatial-temporal action exploration strategy is presented to expand the sample space effectively, generating more valuable samples in a rebalanced manner. Secondly, we design a detached action-aware learning schedule to further mitigate bias in the representation space; the schedule detaches the representation learning of tail classes from training and imposes more effective constraints through an action-aware loss. Additionally, a skip-modal representation is proposed to provide complementary structural information. The proposed method is validated on four skeleton datasets: NTU RGB+D 60, NTU RGB+D 120, NW-UCLA, and Kinetics. It not only achieves consistently large improvements over state-of-the-art (SOTA) methods, but also demonstrates superior generalization capacity in extensive experiments. Our code is available at https://github.com/firework8/BRL.
Submitted 27 August, 2023; originally announced August 2023.
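For context on the rebalancing this method builds on: a generic long-tailed baseline samples classes at equal expected frequency via inverse-frequency weights. This is a standard baseline sketch, not the paper's spatial-temporal exploration strategy, which additionally synthesizes new tail-class samples:

```python
import torch
from torch.utils.data import WeightedRandomSampler

def balanced_sampler(labels: list[int]) -> WeightedRandomSampler:
    """Sample long-tailed classes at equal expected frequency.

    labels: integer class label per training sample. Each sample is weighted
    by the inverse of its class count, so rare classes are drawn as often
    as common ones in expectation.
    """
    labels_t = torch.tensor(labels)
    counts = torch.bincount(labels_t).float()
    weights = 1.0 / counts[labels_t]
    return WeightedRandomSampler(weights, num_samples=len(labels),
                                 replacement=True)
```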
arXiv:2308.09835 (https://arxiv.org/abs/2308.09835) [pdf, other] cs.CV cs.LG
Microscopy Image Segmentation via Point and Shape Regularized Data Synthesis
Authors: Shijie Li, Mengwei Ren, Thomas Ach, Guido Gerig
Abstract: Current deep learning-based approaches for microscopy image segmentation heavily rely on large amounts of training data with dense annotation, which is highly costly and laborious in practice. Compared to full annotation, where the complete contour of each object is depicted, point annotations, specifically object centroids, are much easier to acquire and still provide crucial information about the objects for subsequent segmentation. In this paper, we assume access to point annotations only during training and develop a unified pipeline for microscopy image segmentation using synthetically generated training data. Our framework has three stages: (1) it takes point annotations and samples a pseudo dense segmentation mask constrained by shape priors; (2) with an image generative model trained in an unpaired manner, it translates the mask into a realistic microscopy image regularized by object-level consistency; (3) the pseudo masks and synthetic images then constitute a paired dataset for training an ad-hoc segmentation model. On the public MoNuSeg dataset, our synthesis pipeline produces more diverse and realistic images than baseline models while maintaining high coherence between input masks and generated images. When using identical segmentation backbones, models trained on our synthetic dataset significantly outperform those trained with pseudo-labels or baseline-generated images. Moreover, our framework achieves comparable results to models trained on authentic microscopy images with dense labels, demonstrating its potential as a reliable and highly efficient alternative to labor-intensive manual pixel-wise annotation in microscopy image segmentation. The code is available.
Submitted 8 December, 2023; v1 submitted 18 August, 2023; originally announced August 2023.
Comments: Accepted by The 3rd MICCAI Workshop on Data Augmentation, Labeling, and Imperfections
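Stage (1) of such a pipeline can be sketched concretely: turn point annotations into a pseudo dense mask under a shape prior. Random ellipses centered on the annotated centroids are a stand-in here; the paper's shape priors are not detailed in the abstract:

```python
import numpy as np

def sample_mask_from_points(points, img_hw=(256, 256), r_range=(8, 16), seed=0):
    """Sample a pseudo dense segmentation mask from point annotations.

    points: iterable of (cx, cy) object centroids. Each point is inflated
    into a random ellipse, a simple illustrative shape prior.
    """
    rng = np.random.default_rng(seed)
    h, w = img_hw
    yy, xx = np.mgrid[0:h, 0:w]
    mask = np.zeros(img_hw, dtype=np.uint8)
    for cx, cy in points:
        rx, ry = rng.uniform(*r_range, size=2)
        mask |= (((xx - cx) / rx) ** 2 + ((yy - cy) / ry) ** 2 <= 1).astype(np.uint8)
    return mask  # later paired with a generator-translated synthetic image
```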
arXiv:2307.14617 (https://arxiv.org/abs/2307.14617) [pdf, other] cs.CV
Multiscale Dynamic Graph Representation for Biometric Recognition with Occlusions
Authors: Min Ren, Yunlong Wang, Yuhao Zhu, Kunbo Zhang, Zhenan Sun
Abstract: Occlusion is a common problem for biometric recognition in the wild, and the generalization ability of CNNs decreases greatly under the adverse effects of various occlusions. To this end, we propose a novel unified framework that integrates the merits of both CNNs and graph models to overcome occlusion problems in biometric recognition, called multiscale dynamic graph representation (MS-DGR). More specifically, a group of deep features reflecting certain subregions is recrafted into a feature graph (FG). Each node of the FG is deemed to characterize a specific local region of the input sample, and the edges imply the co-occurrence of non-occluded regions. By analyzing the similarities of the node representations and measuring the topological structures stored in the adjacency matrix, the proposed framework leverages dynamic graph matching to judiciously discard the nodes corresponding to the occluded parts. A multiscale strategy is further incorporated to attain more diverse nodes representing regions of various sizes. Furthermore, the framework offers more interpretable and reasonable inference by exposing the paired nodes. Extensive experiments demonstrate the superiority of the proposed framework, which boosts accuracy by a large margin over baseline methods in both natural and occlusion-simulated cases.
Submitted 27 July, 2023; originally announced July 2023.
Comments: Accepted by TPAMI
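A toy version of the node-discarding idea, assuming L2-normalized node features and same-ordered region nodes across the two samples; the paper's dynamic graph matching additionally uses the adjacency structure, which this sketch omits:

```python
import numpy as np

def match_visible_nodes(feat_a, feat_b, keep_threshold=0.5):
    """Score two samples while discarding likely-occluded region nodes.

    feat_a, feat_b: (N, D) L2-normalized node features whose rows describe
    the same local regions. Nodes whose corresponding-region similarity is
    low are treated as occluded and dropped before scoring.
    """
    sims = feat_a @ feat_b.T              # (N, N) node-to-node similarity
    pairwise = np.diag(sims)              # same-region correspondences
    visible = pairwise >= keep_threshold  # mask out likely-occluded regions
    if not visible.any():
        return 0.0, visible
    return float(pairwise[visible].mean()), visible
```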