Search | arXiv e-print repository
Showing 1–50 of 1,187 results for author: Chen, D

Searching in archive cs. Results are sorted by announcement date (newest first), 50 per page, with abstracts shown.
1. arXiv:2411.17188 [pdf, other] (cs.CV, cs.CL)

Interleaved Scene Graph for Interleaved Text-and-Image Generation Assessment

Authors: Dongping Chen, Ruoxi Chen, Shu Pu, Zhaoyi Liu, Yanru Wu, Caixi Chen, Benlin Liu, Yue Huang, Yao Wan, Pan Zhou, Ranjay Krishna

Abstract: Many real-world user queries (e.g., "How to make egg fried rice?") could benefit from systems capable of generating responses with both textual steps and accompanying images, similar to a cookbook. Models designed to generate interleaved text and images face challenges in ensuring consistency within and across these modalities. To address these challenges, we present ISG, a comprehensive evaluation framework for interleaved text-and-image generation. ISG leverages a scene graph structure to capture relationships between text and image blocks, evaluating responses on four levels of granularity: holistic, structural, block-level, and image-specific. This multi-tiered evaluation allows for a nuanced assessment of consistency, coherence, and accuracy, and provides interpretable question-answer feedback. In conjunction with ISG, we introduce a benchmark, ISG-Bench, encompassing 1,150 samples across 8 categories and 21 subcategories. This benchmark dataset includes complex language-vision dependencies and golden answers to evaluate models effectively on vision-centric tasks such as style transfer, a challenging area for current models. Using ISG-Bench, we demonstrate that recent unified vision-language models perform poorly on generating interleaved content. While compositional approaches that combine separate language and image models show a 111% improvement over unified models at the holistic level, their performance remains suboptimal at both block and image levels. To facilitate future work, we develop ISG-Agent, a baseline agent employing a "plan-execute-refine" pipeline to invoke tools, achieving a 122% performance improvement.

Submitted 26 November, 2024; originally announced November 2024.

2. arXiv:2411.16763 [pdf, other] (cs.CR, cs.AI)

Hide in Plain Sight: Clean-Label Backdoor for Auditing Membership Inference

Authors: Depeng Chen, Hao Chen, Hulin Jin, Jie Cui, Hong Zhong

Abstract: Membership inference attacks (MIAs) are critical tools for assessing privacy risks and ensuring compliance with regulations like the General Data Protection Regulation (GDPR). However, their potential for auditing unauthorized use of data remains underexplored. To bridge this gap, we propose a novel clean-label backdoor-based approach for MIAs, designed specifically for robust and stealthy data auditing. Unlike conventional methods that rely on detectable poisoned samples with altered labels, our approach retains natural labels, enhancing stealthiness even at low poisoning rates. Our approach employs an optimal trigger generated by a shadow model that mimics the target model's behavior. This design minimizes the feature-space distance between triggered samples and the source class while preserving the original data labels. The result is a powerful and undetectable auditing mechanism that overcomes limitations of existing approaches, such as label inconsistencies and visual artifacts in poisoned samples. The proposed method enables robust data auditing through black-box access, achieving high attack success rates across diverse datasets and model architectures. Additionally, it addresses challenges related to trigger stealthiness and poisoning durability, establishing itself as a practical and effective solution for data auditing. Comprehensive experiments validate the efficacy and generalizability of our approach, outperforming several baseline methods in both stealth and attack success metrics.

Submitted 24 November, 2024; originally announced November 2024.

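The core mechanism this abstract describes, optimizing a trigger so that triggered samples move toward the source class in a shadow model's feature space while labels stay untouched, can be sketched roughly as below. All names and the exact loss are assumptions for illustration, not the paper's published method.

```python
import torch
import torch.nn.functional as F

def optimize_trigger(feature_extractor, poison_x, source_feats,
                     steps=200, lr=0.01, eps=8 / 255):
    """Learn a small additive trigger that pulls triggered samples toward
    the source-class centroid in feature space, leaving labels untouched.

    feature_extractor: callable mapping an image batch to feature vectors
    (e.g. the penultimate layer of a shadow model). Illustrative only.
    """
    trigger = torch.zeros_like(poison_x[:1], requires_grad=True)
    opt = torch.optim.Adam([trigger], lr=lr)
    centroid = source_feats.mean(dim=0, keepdim=True)  # source-class centroid
    for _ in range(steps):
        feats = feature_extractor((poison_x + trigger).clamp(0, 1))
        loss = F.mse_loss(feats, centroid.expand_as(feats))
        opt.zero_grad()
        loss.backward()
        opt.step()
        with torch.no_grad():          # keep the perturbation imperceptible
            trigger.clamp_(-eps, eps)
    return trigger.detach()
```

The eps clamp is what keeps the scheme "clean-label": the poisoned images stay visually close to the originals and keep their natural labels.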
3. arXiv:2411.16277 [pdf, other] (econ.GN, cs.CE, cs.CR, q-fin.CP, stat.ML)

FinML-Chain: A Blockchain-Integrated Dataset for Enhanced Financial Machine Learning

Authors: Jingfeng Chen, Wanlin Deng, Dangxing Chen, Luyao Zhang

Abstract: Machine learning is critical for innovation and efficiency in financial markets, offering predictive models and data-driven decision-making. However, challenges such as missing data, lack of transparency, untimely updates, insecurity, and incompatible data sources limit its effectiveness. Blockchain technology, with its transparency, immutability, and real-time updates, addresses these challenges. We present a framework for integrating high-frequency on-chain data with low-frequency off-chain data, providing a benchmark for addressing novel research questions in economic mechanism design. This framework generates modular, extensible datasets for analyzing economic mechanisms such as the Transaction Fee Mechanism, enabling multi-modal insights and fairness-driven evaluations. Using four machine learning techniques (linear regression, deep neural networks, XGBoost, and LSTM models), we demonstrate the framework's ability to produce datasets that advance financial research and improve understanding of blockchain-driven systems. Our contributions include: (1) proposing a research scenario for the Transaction Fee Mechanism and demonstrating how the framework addresses previously unexplored questions in economic mechanism design; (2) providing a benchmark for financial machine learning by open-sourcing a sample dataset generated by the framework and the code for the pipeline, enabling continuous dataset expansion; and (3) promoting reproducibility, transparency, and collaboration by fully open-sourcing the framework and its outputs. This initiative supports researchers in extending our work and developing innovative financial machine-learning models, fostering advancements at the intersection of machine learning, blockchain, and economics.

Submitted 25 November, 2024; originally announced November 2024.

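The key data-engineering step here, aligning high-frequency on-chain observations with low-frequency off-chain data, is the kind of as-of join sketched below. The column names and values are invented for illustration; the paper's actual schema and pipeline are open-sourced separately.

```python
import pandas as pd
from sklearn.linear_model import LinearRegression

# Hypothetical inputs: per-block on-chain observations (high frequency)
# and daily off-chain market data (low frequency).
onchain = pd.DataFrame({
    "timestamp": pd.to_datetime(["2024-11-25 00:00:10",
                                 "2024-11-25 00:00:22",
                                 "2024-11-25 00:00:34"]),
    "base_fee_gwei": [14.2, 14.9, 13.8],
    "gas_used_ratio": [0.62, 0.71, 0.48],
})
offchain = pd.DataFrame({
    "timestamp": pd.to_datetime(["2024-11-24", "2024-11-25"]),
    "eth_usd_close": [3350.0, 3410.0],
})

# Align each on-chain row with the latest available off-chain value.
merged = pd.merge_asof(onchain.sort_values("timestamp"),
                       offchain.sort_values("timestamp"),
                       on="timestamp", direction="backward")

# Fit one of the baseline model classes the abstract mentions.
X = merged[["gas_used_ratio", "eth_usd_close"]]
y = merged["base_fee_gwei"]
print(LinearRegression().fit(X, y).coef_)
```

The backward-looking join matters for financial data: each on-chain row may only see off-chain information that was already available at its timestamp, avoiding look-ahead leakage.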
4. arXiv:2411.16193 [pdf, ps, other] (cs.CY)

The Critical Canvas--How to regain information autonomy in the AI era

Authors: Dong Chen

Abstract: In the era of AI, recommendation algorithms and generative AI challenge information autonomy by creating echo chambers and blurring the line between authentic and fabricated content. The Critical Canvas addresses these challenges with a novel information exploration platform designed to restore balance between algorithmic efficiency and human agency. It employs three key mechanisms: multi-dimensional exploration across logical, temporal, and geographical perspectives; dynamic knowledge entry generation to capture complex relationships between concepts; and a phase space to evaluate the credibility of both the content and its sources. Particularly relevant to technical AI governance, where stakeholders must navigate intricate specifications and safety frameworks, the platform transforms overwhelming technical information into actionable insights. The Critical Canvas empowers users to regain autonomy over their information consumption through structured yet flexible exploration pathways, creative visualization, human-centric navigation, and transparent source evaluation. It fosters a comprehensive understanding of nuanced topics, enabling more informed decision-making and effective policy development in the age of AI.

Submitted 25 November, 2024; originally announced November 2024.

5. arXiv:2411.16097 [pdf, other] (cs.HC)

Auto-calibrated Wearable System for Load Vertical Location Estimation During Manual Lifting

Authors: Diliang Chen, Nozhan Ghoreishi, John LaCourse, Sajay Arthanat, Dain LaRoche

Abstract: Lifting during manual material handling is a major cause of low-back pain (LBP). As an important risk factor that directly influences the risk of LBP, the load vertical location (LVL) during lifting needs to be measured and controlled. However, existing solutions for LVL measurement are inefficient, inaccurate, and impractical for real-world workplace environments. To address these problems, an unobtrusive wearable system, including smart insoles and smart wristbands, was proposed to measure LVL accurately in workplace environments. Different from traditional methods, which rely on inertial measurement units (IMUs) and suffer from integral drifting errors over time, a novel barometer-based LVL measurement method was proposed in this study. To correct the environment-induced LVL measurement errors in the barometer-based method, a novel Known Vertical Location Update (KVLU) method was proposed. This method calibrates the measured LVL using a known wrist vertical location at known postures during frequently used daily activities such as standing and walking. The proposed wearable system achieved a mean absolute error (MAE) of 5.71 cm in LVL measurement. This result indicates that the proposed system has the potential to reliably measure LVL and assess the risk of LBP in manual lifting tasks.

Submitted 25 November, 2024; originally announced November 2024.

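A rough sketch of the two ingredients this abstract describes: converting barometric pressure to altitude (here via the standard international barometric formula) and re-zeroing the drifting offset whenever the wrist is at a known height. Class and function names are hypothetical, not the paper's implementation.

```python
def pressure_to_altitude(p_hpa, p_ref_hpa=1013.25):
    """Convert barometric pressure (hPa) to altitude (m) using the
    standard international barometric formula (ISA model)."""
    return 44330.0 * (1.0 - (p_hpa / p_ref_hpa) ** (1.0 / 5.255))

class KVLUCalibrator:
    """Known Vertical Location Update, sketched: barometric altitude drifts
    with weather and environment, so whenever a posture detector says the
    wrist is at a known height (e.g. standing), re-zero the offset."""

    def __init__(self):
        self.offset = 0.0

    def update(self, measured_alt_m, known_alt_m):
        # Called only at moments when the true wrist height is known.
        self.offset = known_alt_m - measured_alt_m

    def corrected(self, measured_alt_m):
        return measured_alt_m + self.offset

cal = KVLUCalibrator()
# Standing posture detected: wrist is ~1.05 m above the floor (assumed).
cal.update(measured_alt_m=pressure_to_altitude(1012.1), known_alt_m=1.05)
load_height = cal.corrected(pressure_to_altitude(1012.0))
print(f"estimated load vertical location: {load_height:.2f} m")
```

Unlike IMU double integration, the offset here is reset at every known-posture event, so errors stay bounded instead of accumulating.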
6. arXiv:2411.14808 [pdf, other] (cs.CV, cs.AI, cs.LG)

High-Resolution Image Synthesis via Next-Token Prediction

Authors: Dengsheng Chen, Jie Hu, Tiezhu Yue, Xiaoming Wei

Abstract: Denoising with a Joint-Embedding Predictive Architecture (D-JEPA), an autoregressive model, has demonstrated outstanding performance in class-conditional image generation. However, the application of next-token prediction in high-resolution text-to-image generation remains underexplored. In this paper, we introduce D-JEPA·T2I, an extension of D-JEPA incorporating flow matching loss, designed to enable data-efficient continuous resolution learning. D-JEPA·T2I leverages a multimodal visual transformer to effectively integrate textual and visual features and adopts Visual Rotary Positional Embedding (VoPE) to facilitate continuous resolution learning. Furthermore, we devise a data feedback mechanism that significantly enhances data utilization efficiency. For the first time, we achieve state-of-the-art high-resolution image synthesis via next-token prediction. The experimental code and pretrained models will be open-sourced at https://d-jepa.github.io/t2i.

Submitted 22 November, 2024; originally announced November 2024.

Comments: 30 pages

7. arXiv:2411.13934 [pdf, other] (cs.LG, cs.AI, cs.MA)

Learning to Cooperate with Humans using Generative Agents

Authors: Yancheng Liang, Daphne Chen, Abhishek Gupta, Simon S. Du, Natasha Jaques

Abstract: Training agents that can coordinate zero-shot with humans is a key mission in multi-agent reinforcement learning (MARL). Current algorithms focus on training simulated human partner policies which are then used to train a Cooperator agent. The simulated human is produced either through behavior cloning over a dataset of human cooperation behavior, or by using MARL to create a population of simulated agents. However, these approaches often struggle to produce a Cooperator that can coordinate well with real humans, since the simulated humans fail to cover the diverse strategies and styles employed by people in the real world. We show that learning a generative model of human partners can effectively address this issue. Our model learns a latent variable representation of the human that can be regarded as encoding the human's unique strategy, intention, experience, or style. This generative model can be flexibly trained from any (human or neural policy) agent interaction data. By sampling from the latent space, we can use the generative model to produce different partners to train Cooperator agents. We evaluate our method, Generative Agent Modeling for Multi-agent Adaptation (GAMMA), on Overcooked, a challenging cooperative cooking game that has become a standard benchmark for zero-shot coordination. We conduct an evaluation with real human teammates, and the results show that GAMMA consistently improves performance, whether the generative model is trained on simulated populations or human datasets. Further, we propose a method for posterior sampling from the generative model that is biased towards the human data, enabling us to efficiently improve performance with only a small amount of expensive human interaction data.

Submitted 21 November, 2024; originally announced November 2024.

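As a minimal sketch of the latent-variable partner idea described above (a generic conditional policy, not GAMMA's actual architecture), a latent z stands in for a partner's "style", and sampling fresh z values yields diverse training partners:

```python
import torch
import torch.nn as nn

class PartnerModel(nn.Module):
    """Hypothetical latent-variable partner model: a latent z encodes a
    partner's style; the decoder maps (observation, z) to action logits."""

    def __init__(self, obs_dim, act_dim, z_dim=8):
        super().__init__()
        self.z_dim = z_dim
        self.decoder = nn.Sequential(
            nn.Linear(obs_dim + z_dim, 128), nn.ReLU(),
            nn.Linear(128, act_dim),
        )

    def act(self, obs, z):
        logits = self.decoder(torch.cat([obs, z], dim=-1))
        return torch.distributions.Categorical(logits=logits).sample()

# After fitting such a model on interaction data, each sampled z ~ N(0, I)
# behaves like a different partner for the Cooperator to train against.
model = PartnerModel(obs_dim=16, act_dim=6)
z = torch.randn(1, model.z_dim)           # one sampled "partner style"
action = model.act(torch.zeros(1, 16), z)
```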
8. arXiv:2411.13873 [pdf, other] (cs.CV)

Sli2Vol+: Segmenting 3D Medical Images Based on an Object Estimation Guided Correspondence Flow Network

Authors: Delin An, Pengfei Gu, Milan Sonka, Chaoli Wang, Danny Z. Chen

Abstract: Deep learning (DL) methods have shown remarkable successes in medical image segmentation, often using large amounts of annotated data for model training. However, acquiring a large number of diverse labeled 3D medical image datasets is highly difficult and expensive. Recently, mask propagation DL methods were developed to reduce the annotation burden on 3D medical images. For example, Sli2Vol [yeung2021sli2vol] proposed a self-supervised framework (SSF) to learn correspondences by matching neighboring slices via slice reconstruction in the training stage; the learned correspondences were then used to propagate a labeled slice to other slices in the test stage. But these methods are still prone to error accumulation due to the inter-slice propagation of reconstruction errors. Also, they do not handle discontinuities well, which can occur between consecutive slices in 3D images, as they emphasize exploiting object continuity. To address these challenges, in this work, we propose a new SSF, called Sli2Vol+, for segmenting any anatomical structures in 3D medical images using only a single annotated slice per training and testing volume. Specifically, in the training stage, we first propagate an annotated 2D slice of a training volume to the other slices, generating pseudo-labels (PLs). Then, we develop a novel Object Estimation Guided Correspondence Flow Network to learn reliable correspondences between consecutive slices and corresponding PLs in a self-supervised manner. In the test stage, such correspondences are utilized to propagate a single annotated slice to the other slices of a test volume. We demonstrate the effectiveness of our method on various medical image segmentation tasks with different datasets, showing better generalizability across different organs, modalities, and models. Code is available at https://github.com/adlsn/Sli2Volplus

Submitted 21 November, 2024; originally announced November 2024.

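The propagation skeleton common to Sli2Vol-style methods can be sketched as below: starting from one annotated slice, warp the mask outward slice by slice in both directions. The identity warp here is a toy stand-in for the learned correspondence network; all names are illustrative.

```python
import numpy as np

def propagate_labels(volume, seed_slice_idx, seed_mask, match_fn):
    """Propagate a single annotated slice through a 3D volume, slice by
    slice, using some correspondence function between neighboring slices."""
    masks = {seed_slice_idx: seed_mask}
    for direction in (+1, -1):
        idx, mask = seed_slice_idx, seed_mask
        while 0 <= idx + direction < volume.shape[0]:
            # match_fn warps the current mask onto the neighboring slice.
            mask = match_fn(volume[idx], volume[idx + direction], mask)
            idx += direction
            masks[idx] = mask
    return masks

# Toy correspondence: identity warp (stands in for the learned flow network).
identity = lambda src_slice, dst_slice, mask: mask
vol = np.zeros((5, 64, 64))
seed = np.zeros((64, 64), dtype=bool)
all_masks = propagate_labels(vol, seed_slice_idx=2, seed_mask=seed,
                             match_fn=identity)
```

The chain structure also shows why errors accumulate, the weakness the abstract points out: each propagated mask inherits every warp error before it.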
9. arXiv:2411.13239 [pdf] (cs.DC, cs.AI, cs.AR, cs.ET, cs.MA)

Transforming the Hybrid Cloud for Emerging AI Workloads

Authors: Deming Chen, Alaa Youssef, Ruchi Pendse, André Schleife, Bryan K. Clark, Hendrik Hamann, Jingrui He, Teodoro Laino, Lav Varshney, Yuxiong Wang, Avirup Sil, Reyhaneh Jabbarvand, Tianyin Xu, Volodymyr Kindratenko, Carlos Costa, Sarita Adve, Charith Mendis, Minjia Zhang, Santiago Núñez-Corrales, Raghu Ganti, Mudhakar Srivatsa, Nam Sung Kim, Josep Torrellas, Jian Huang, Seetharami Seelam, et al. (19 additional authors not shown)

Abstract: This white paper, developed through close collaboration between IBM Research and UIUC researchers within the IIDAI Institute, envisions transforming hybrid cloud systems to meet the growing complexity of AI workloads through innovative, full-stack co-design approaches, emphasizing usability, manageability, affordability, adaptability, efficiency, and scalability. By integrating cutting-edge technologies such as generative and agentic AI, cross-layer automation and optimization, unified control plane, and composable and adaptive system architecture, the proposed framework addresses critical challenges in energy efficiency, performance, and cost-effectiveness. Incorporating quantum computing as it matures will enable quantum-accelerated simulations for materials science, climate modeling, and other high-impact domains. Collaborative efforts between academia and industry are central to this vision, driving advancements in foundation models for material design and climate solutions, scalable multimodal data processing, and enhanced physics-based AI emulators for applications like weather forecasting and carbon sequestration. Research priorities include advancing AI agentic systems, LLM as an Abstraction (LLMaaA), AI model optimization and unified abstractions across heterogeneous infrastructure, end-to-end edge-cloud transformation, efficient programming model, middleware and platform, secure infrastructure, application-adaptive cloud systems, and new quantum-classical collaborative workflows. These ideas and solutions encompass both theoretical and practical research questions, requiring coordinated input and support from the research community. This joint initiative aims to establish hybrid clouds as secure, efficient, and sustainable platforms, fostering breakthroughs in AI-driven applications and scientific discovery across academia, industry, and society.

Submitted 20 November, 2024; originally announced November 2024.

Comments: 70 pages, 27 figures

10. arXiv:2411.12792 [pdf, other] (cs.CV)

CLIC: Contrastive Learning Framework for Unsupervised Image Complexity Representation

Authors: Shipeng Liu, Liang Zhao, Dengfeng Chen

Abstract: As an essential visual attribute, image complexity affects human image comprehension and directly influences the performance of computer vision tasks. However, accurately assessing and quantifying image complexity faces significant challenges. Previous works lacked generalization capability and required well-labeled datasets to learn image complexity features; creating such datasets requires expensive manual labeling, and the models inevitably learn human subjective biases. To address the above problems, we propose CLIC, an unsupervised framework based on contrastive learning, for learning image complexity representations. The method learns image complexity features on unlabeled data, avoiding the high labeling cost. Specifically, we propose a unique positive and negative sample selection strategy to reinforce the differences in complexity features. At the same time, we introduce an image prior-based Complexity-Aware Loss to constrain the learning process of the model. We conducted extensive experiments for verification, and the results show that CLIC can effectively learn the image complexity representation. CLIC obtained competitive results with supervised methods by fine-tuning on IC9600. In addition, CLIC applied to downstream tasks shows significant performance improvements, demonstrating the potential for application in various real-world scenarios. Code is available at https://github.com/xauat-liushipeng/CLIC

Submitted 19 November, 2024; originally announced November 2024.

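The abstract does not spell out CLIC's Complexity-Aware Loss, but contrastive frameworks of this kind typically build on the standard InfoNCE objective, sketched below over precomputed embeddings (shapes and names are assumptions for illustration).

```python
import torch
import torch.nn.functional as F

def info_nce(anchor, positive, negatives, tau=0.1):
    """Standard InfoNCE contrastive loss: pull each anchor toward its
    positive and away from the negatives.

    anchor, positive: (B, D) embeddings; negatives: (K, D) embeddings.
    """
    a = F.normalize(anchor, dim=-1)
    p = F.normalize(positive, dim=-1)
    n = F.normalize(negatives, dim=-1)
    pos = (a * p).sum(-1, keepdim=True) / tau              # (B, 1)
    neg = a @ n.T / tau                                    # (B, K)
    logits = torch.cat([pos, neg], dim=1)                  # (B, 1 + K)
    labels = torch.zeros(a.size(0), dtype=torch.long,
                         device=a.device)                  # positive = index 0
    return F.cross_entropy(logits, labels)
```

What CLIC would change relative to this baseline is where the positives and negatives come from: its selection strategy picks pairs that differ in complexity rather than in semantic content.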
arXiv:2411.12478 [pdf]
Subjects: cs.RO (Robotics); eess.SY (Systems and Control)
Robotic transcatheter tricuspid valve replacement with hybrid enhanced intelligence: a new paradigm and first-in-vivo study
Authors: Shuangyi Wang, Haichuan Lin, Yiping Xie, Ziqi Wang, Dong Chen, Longyue Tan, Xilong Hou, Chen Chen, Xiao-Hu Zhou, Shengtao Lin, Fei Pan, Kent Chak-Yu So, Zeng-Guang Hou
Abstract: Transcatheter tricuspid valve replacement (TTVR) is the latest treatment for tricuspid regurgitation and is in the early stages of clinical adoption. Intelligent robotic approaches are expected to overcome the challenges of surgical manipulation and widespread dissemination, but systems and protocols with high clinical utility have not yet been reported. In this study, we propose a complete solution comprising a passive stabilizer, a robotic drive, a detachable delivery catheter, and a valve manipulation mechanism. Working towards autonomy, we introduce a hybrid augmented intelligence approach based on reinforcement learning, Monte Carlo probabilistic maps, and human-robot co-piloted control. Systematic tests in phantom and first-in-vivo animal experiments were performed to verify that the system design meets the clinical requirements. Furthermore, the experimental results confirmed the advantages of co-piloted control over conventional master-slave control in terms of time efficiency, control efficiency, autonomy, and stability of operation. In conclusion, this study provides a comprehensive pathway for robotic TTVR and, to our knowledge, reports the first animal study that both demonstrates the application of hybrid enhanced intelligence in interventional robotics and provides a solution of high application value for a cutting-edge procedure.
Submitted 19 November, 2024; originally announced November 2024.

arXiv:2411.12071 [pdf, ps, other]
Subjects: cs.LG (Machine Learning); cs.CR (Cryptography and Security)
Theoretical Corrections and the Leveraging of Reinforcement Learning to Enhance Triangle Attack
Authors: Nicole Meng, Caleb Manicke, David Chen, Yingjie Lao, Caiwen Ding, Pengyu Hong, Kaleel Mahmood
Abstract: Adversarial examples represent a serious issue for the application of machine learning models in many sensitive domains. For generating adversarial examples, decision-based black-box attacks are among the most practical techniques, as they require only query access to the model. One of the most recently proposed state-of-the-art decision-based black-box attacks is Triangle Attack (TA). In this paper, we offer a high-level description of TA and explain its potential theoretical limitations. We then propose a new decision-based black-box attack, Triangle Attack with Reinforcement Learning (TARL), which addresses the limits of TA by leveraging reinforcement learning. TARL achieves similar, if not better, attack accuracy than TA with half as many queries on state-of-the-art classifiers and defenses across ImageNet and CIFAR-10.
Submitted 18 November, 2024; originally announced November 2024.
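As background on the decision-based (hard-label) setting in which both TA and TARL operate, the sketch below shows the kind of query loop such attacks rely on. It is a generic binary search toward the decision boundary, not TA or TARL itself; the model and inputs are placeholders.

```python
# Generic hard-label building block (illustrative, not TA/TARL):
# the attacker sees only the predicted label and pays one query per call.
import numpy as np

def boundary_binary_search(model, x_orig, x_adv, y_true, steps=20):
    """Shrink an adversarial point toward the original along a line segment.

    model(x) returns a top-1 label only; x_adv must already be adversarial.
    Each loop iteration costs exactly one model query.
    """
    lo, hi = 0.0, 1.0  # interpolation weights toward x_adv
    for _ in range(steps):
        mid = (lo + hi) / 2.0
        x_mid = (1.0 - mid) * x_orig + mid * x_adv
        if model(x_mid) != y_true:   # still misclassified: keep shrinking
            hi = mid
        else:                        # crossed back over the boundary
            lo = mid
    return (1.0 - hi) * x_orig + hi * x_adv
```

Query efficiency, the metric on which TARL claims a 2x improvement, is simply how few such calls an attack needs to reach a given perturbation size.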
arXiv:2411.11551 [pdf, other]
Subjects: cs.CR (Cryptography and Security)
Simple But Not Secure: An Empirical Security Analysis of Two-factor Authentication Systems
Authors: Zhi Wang, Xin Yang, Du Chen, Han Gao, Meiqi Tian, Yan Jia, Wanpeng Li
Abstract: To protect users from data breaches and phishing attacks, service providers typically implement two-factor authentication (2FA) to add an extra layer of security against suspicious login attempts. However, since 2FA can hinder the user experience by introducing additional steps, many websites reduce the inconvenience by minimizing the frequency of 2FA prompts. One approach is to store the user's "Remember the Device" preference in a cookie, so that users are prompted for 2FA only when the cookie expires or they log in from a new device. To understand and improve the security of 2FA systems in real-world settings, we propose SE2FA, a vulnerability evaluation framework designed to detect weaknesses in 2FA systems. Using this framework, we analyze the security of 407 2FA systems across popular websites from the Tranco Top 10,000 list. Our analysis found three zero-day vulnerabilities on three service providers that could allow an attacker to access a victim's account without possessing the victim's second authentication factor, thereby bypassing 2FA entirely. A further investigation found that these vulnerabilities stem from design choices intended to simplify 2FA for users that unintentionally reduce its security. We have disclosed these findings to the affected websites and assisted them in mitigating the risks. Based on these insights, we provide practical recommendations and countermeasures to strengthen 2FA security and address the newly identified threats.
Submitted 18 November, 2024; originally announced November 2024.
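The "Remember the Device" mechanism the paper studies typically amounts to a signed, expiring token stored in a cookie. A minimal sketch of that pattern follows; all names are hypothetical and this is not SE2FA or any audited site's code. It makes the attack surface concrete: if such a token is not bound to the device or revocable, anyone holding the cookie skips 2FA.

```python
# Minimal sketch of a "Remember the Device" cookie (hypothetical names).
# A signed, expiring token; 2FA is skipped while it verifies.
import hashlib
import hmac
import time

SECRET = b"server-side-secret"  # assumption: a per-deployment secret key

def issue_remember_cookie(user_id, days=30):
    exp = str(int(time.time()) + days * 86400)
    sig = hmac.new(SECRET, f"{user_id}:{exp}".encode(), hashlib.sha256).hexdigest()
    return f"{user_id}:{exp}:{sig}"

def requires_2fa(cookie, user_id):
    if not cookie:
        return True
    try:
        uid, exp, sig = cookie.split(":")
    except ValueError:
        return True
    want = hmac.new(SECRET, f"{uid}:{exp}".encode(), hashlib.sha256).hexdigest()
    # valid signature, correct user, not expired -> skip the 2FA prompt
    valid = hmac.compare_digest(sig, want) and uid == user_id and time.time() < int(exp)
    return not valid

cookie = issue_remember_cookie("alice")
print(requires_2fa(cookie, "alice"))  # False: 2FA prompt skipped
```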
arXiv:2411.11477 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
SL-YOLO: A Stronger and Lighter Drone Target Detection Model
Authors: Defan Chen, Luchan Zhang
Abstract: Detecting small objects in complex scenes, such as those captured by drones, is a daunting challenge due to the difficulty of capturing the complex features of small targets. While the YOLO family has achieved great success in large target detection, its performance is less satisfactory on small targets. This paper therefore proposes SL-YOLO (Stronger and Lighter YOLO), a model that aims to break the bottleneck of small target detection. We propose the Hierarchical Extended Path Aggregation Network (HEPAN), a cross-scale feature fusion method that maintains detection accuracy even in challenging environments. At the same time, without sacrificing detection capability, we design the lightweight C2fDCB module and add the SCDown downsampling module to greatly reduce the model's parameters and computational complexity. On the VisDrone2019 dataset, mAP@0.5 rises from 43.0% to 46.9% and mAP@0.5:0.95 from 26.0% to 28.9%, while the model's parameters drop from 11.1M to 9.6M and it runs at 132 FPS, making it well suited to real-time small object detection in resource-constrained environments.
Submitted 18 November, 2024; originally announced November 2024.

arXiv:2411.11361 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Scalable Autoregressive Monocular Depth Estimation
Authors: Jinhong Wang, Jian Liu, Dongqi Tang, Weiqiang Wang, Wentong Li, Danny Chen, Jintai Chen, Jian Wu
Abstract: This paper proposes a new autoregressive model as an effective and scalable monocular depth estimator. Our idea is simple: we tackle the monocular depth estimation (MDE) task with an autoregressive prediction paradigm built on two core designs. First, our depth autoregressive model (DAR) treats depth maps of different resolutions as a set of tokens and carries out low-to-high-resolution autoregressive prediction with a patch-wise causal mask. Second, DAR recursively discretizes the entire depth range into increasingly compact intervals, attaining a coarse-to-fine autoregressive objective in an ordinal-regression manner. By coupling these two objectives, DAR establishes a new state of the art (SOTA) on KITTI and NYU Depth v2 by clear margins. Furthermore, our scalable approach allows us to scale the model up to 2.0B parameters and achieve the best RMSE of 1.799 on KITTI (a 5% improvement over the 1.896 of the current SOTA, Depth Anything). DAR also shows zero-shot generalization on unseen datasets. These results suggest that DAR yields superior performance with an autoregressive prediction paradigm and offers a promising way to equip modern autoregressive large models (e.g., GPT-4o) with depth estimation capabilities.
Submitted 18 November, 2024; originally announced November 2024.
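The coarse-to-fine depth discretization DAR describes can be pictured with a small sketch. This is an interpretation of the abstract, not the authors' code: each stage re-bins a shrinking depth interval around the current estimate, and names and ratios here are illustrative.

```python
# Sketch of recursive depth-range discretization (interpretation, not DAR).
import numpy as np

def refine_bins(d_min, d_max, n_bins, center, shrink=0.5):
    """Shrink the working depth range around `center`, then re-bin it."""
    width = (d_max - d_min) * shrink
    lo = max(d_min, center - width / 2)
    hi = min(d_max, center + width / 2)
    return np.linspace(lo, hi, n_bins + 1)  # edges of n_bins ordinal intervals

bins = np.linspace(0.1, 80.0, 17)       # stage 1: 16 coarse bins (KITTI-like range)
est = 12.3                               # e.g. expected depth from stage-1 logits
bins = refine_bins(0.1, 80.0, 16, est)   # stage 2: finer bins near the estimate
```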
arXiv:2411.11144 [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CR (Cryptography and Security)
CLMIA: Membership Inference Attacks via Unsupervised Contrastive Learning
Authors: Depeng Chen, Xiao Liu, Jie Cui, Hong Zhong
Abstract: Because machine learning models are often trained on limited data sets, the model sees the same data samples many times, causing it to memorize much of the training set. Membership Inference Attacks (MIAs) exploit this property to determine whether a data sample was used to train a machine learning model. In realistic scenarios, however, it is difficult for the adversary to obtain enough qualified samples with accurate membership labels, especially since most samples in real-world applications are non-members. To address this limitation, we propose a new attack method called CLMIA, which uses unsupervised contrastive learning to train an attack model without extra membership-status information; only a small amount of data with known membership status is needed to fine-tune the attack model. Experimental results demonstrate that CLMIA outperforms existing attack methods across different datasets and model structures, especially when little labeled membership information is available. We also find experimentally that the attack performs differently for different proportions of labeled member and non-member data, and further analysis shows that our method performs better with less labeled information, which matches more realistic scenarios.
Submitted 17 November, 2024; originally announced November 2024.
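For context on what such learned attack models improve over, the classic confidence-threshold baseline for membership inference is only a few lines. This is the textbook baseline, not CLMIA: it exploits the same memorization signal (members tend to receive higher top-class confidence) without training any attack model.

```python
# Classic confidence-threshold MIA baseline (not CLMIA).
import numpy as np

def mia_confidence_attack(probs, threshold=0.9):
    """probs: (N, C) softmax outputs; returns boolean membership guesses."""
    return probs.max(axis=1) >= threshold

probs = np.array([[0.97, 0.02, 0.01],   # high confidence: likely memorized member
                  [0.45, 0.35, 0.20]])  # low confidence: likely non-member
print(mia_confidence_attack(probs))     # [ True False]
```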
arXiv:2411.07725 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
ALOcc: Adaptive Lifting-based 3D Semantic Occupancy and Cost Volume-based Flow Prediction
Authors: Dubing Chen, Jin Fang, Wencheng Han, Xinjing Cheng, Junbo Yin, Chenzhong Xu, Fahad Shahbaz Khan, Jianbing Shen
Abstract: Vision-based semantic occupancy and flow prediction plays a crucial role in providing spatiotemporal cues for real-world tasks such as autonomous driving, and existing methods prioritize accuracy to meet these tasks' demands. In this work, we improve performance through a series of targeted refinements to 3D semantic occupancy prediction and flow estimation. First, we introduce an occlusion-aware adaptive lifting mechanism with depth denoising to improve the robustness of the 2D-to-3D feature transformation and reduce reliance on depth priors. Second, we strengthen the semantic consistency between 3D features and their original 2D modalities by using shared semantic prototypes to jointly constrain both, complemented by confidence- and category-based sampling strategies that tackle the long-tail challenges of 3D space. To relieve the feature-encoding burden of jointly predicting semantics and flow, we propose a BEV cost volume-based prediction method that links flow and semantic features through a cost volume and employs a classification-regression supervision scheme to handle the varying flow scales of dynamic scenes. Our purely convolutional framework, ALOcc, achieves a strong speed-accuracy tradeoff and state-of-the-art results on multiple benchmarks. On Occ3D, training without the camera-visible mask, ALOcc achieves an absolute gain of 2.5% RayIoU over the state of the art at comparable speed, using the same input size (256×704) and ResNet-50 backbone. Our method also took 2nd place in the CVPR24 Occupancy and Flow Prediction Competition.
Submitted 12 November, 2024; originally announced November 2024.
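The "2D-to-3D feature transformation" being made occlusion-aware here is, in its generic form, a lift of image features along predicted per-pixel depth distributions. A minimal sketch of that generic lift-splat-style step follows (shapes are arbitrary; this is not ALOcc's adaptive variant):

```python
# Generic depth-distribution lifting of 2D features into a camera frustum.
import torch

B, C, H, W, D = 1, 64, 16, 44, 48
feat = torch.randn(B, C, H, W)            # 2D image features
depth_logits = torch.randn(B, D, H, W)    # per-pixel depth distribution logits
depth_prob = depth_logits.softmax(dim=1)  # normalize over depth hypotheses
# outer product: each pixel's feature is spread along its depth hypotheses
frustum = torch.einsum("bchw,bdhw->bcdhw", feat, depth_prob)  # (B, C, D, H, W)
```

The frustum features are then pooled into a voxel or BEV grid; ALOcc's contribution is making this step adaptive and occlusion-aware rather than a fixed outer product.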
arXiv:2411.07326 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
$SE(3)$ Equivariant Ray Embeddings for Implicit Multi-View Depth Estimation
Authors: Yinshuang Xu, Dian Chen, Katherine Liu, Sergey Zakharov, Rares Ambrus, Kostas Daniilidis, Vitor Guizilini
Abstract: Incorporating inductive bias by embedding geometric entities (such as rays) as input has proven successful in multi-view learning, but methods adopting this technique typically lack equivariance, which is crucial for effective 3D learning. Equivariance serves as a valuable inductive prior that aids the generation of robust multi-view features for 3D scene understanding. In this paper, we explore equivariant multi-view learning for depth estimation, recognizing its significance for computer vision and robotics while addressing the limitations of previous research: most prior studies either overlook equivariance in this setting or achieve only approximate equivariance through data augmentation, which often leads to inconsistencies across reference frames. To address this issue, we propose to embed $SE(3)$ equivariance into the Perceiver IO architecture. We employ Spherical Harmonics for positional encoding to ensure 3D rotation equivariance, and develop a specialized equivariant encoder and decoder within the Perceiver IO architecture. Applied to stereo depth estimation, our model achieves state-of-the-art results on real-world datasets without explicit geometric constraints or extensive data augmentation.
Submitted 11 November, 2024; originally announced November 2024.
Comments: Accepted at NeurIPS 2024
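Spherical-harmonic positional encodings of ray directions, the rotation-equivariant ingredient named in the abstract, can be sketched as follows. This is a simplified real-valued variant for illustration (constant factors of the usual real SH basis are omitted), and the paper's equivariant encoder/decoder design is not reproduced.

```python
# Sketch: spherical-harmonic features of a ray direction (simplified basis).
import numpy as np
from scipy.special import sph_harm

def sh_features(direction, l_max=2):
    x, y, z = direction / np.linalg.norm(direction)
    theta = np.arctan2(y, x) % (2 * np.pi)   # azimuth in [0, 2*pi)
    phi = np.arccos(np.clip(z, -1.0, 1.0))   # polar angle in [0, pi]
    feats = []
    for l in range(l_max + 1):
        for m in range(-l, l + 1):
            Y = sph_harm(m, l, theta, phi)           # complex-valued Y_l^m
            feats.append(Y.real if m >= 0 else Y.imag)  # real-valued variant
    return np.array(feats)                    # (l_max + 1)**2 features

print(sh_features(np.array([0.0, 0.0, 1.0])).shape)  # (9,)
```

Because rotations act on each degree l by a fixed linear map (a Wigner matrix), features built this way transform predictably under 3D rotation, which is what exact equivariance requires.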
arXiv:2411.07175 [pdf, other]
Subjects: cs.CL (Computation and Language)
Continual Memorization of Factoids in Large Language Models
Authors: Howard Chen, Jiayi Geng, Adithya Bhaskar, Dan Friedman, Danqi Chen
Abstract: Large language models can absorb a massive amount of knowledge through pretraining, but pretraining is inefficient for acquiring long-tailed or specialized facts. Fine-tuning on specialized or new knowledge that reflects changes in the world has therefore become popular, though it risks disrupting the model's original capabilities. We study this fragility in the setting of continual memorization, where the model is trained on a small set of long-tail factoids (factual associations) and must retain them after multiple stages of subsequent training on other datasets. Through extensive experiments, we show that LLMs suffer from forgetting across a wide range of subsequent tasks, and that simple replay techniques do not fully prevent it, especially when the factoid datasets are trained in later stages. We posit two ways to alleviate forgetting: 1) protect the memorization process as the model learns the factoids, or 2) reduce interference from training in later stages. With this insight, we develop an effective mitigation strategy: REMIX (Random and Generic Data Mixing). REMIX prevents forgetting by mixing generic data sampled from pretraining corpora, or even randomly generated word sequences, into each training stage, despite this data being unrelated to the factoids memorized in the first stage. REMIX can recover performance from severe forgetting and often outperforms replay-based methods that have access to the first-stage factoids. Analyzing how REMIX alters the learning process, we find that successful forgetting prevention is associated with a characteristic pattern: the model stores factoids in earlier layers than usual and diversifies the set of layers that store them. The efficacy of REMIX invites further investigation into the underlying dynamics of memorization and forgetting, opening promising directions for future research.
Submitted 11 November, 2024; originally announced November 2024.
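The mixing strategy itself is simple to sketch. The version below is illustrative only: the mixing ratio, the vocabulary, and the use of plain strings are assumptions, not the paper's recipe.

```python
# Sketch of REMIX-style data mixing for a later training stage (illustrative).
import random

VOCAB = ["alpha", "beta", "gamma", "delta", "epsilon"]  # stand-in vocabulary

def random_word_sequence(length=32):
    return " ".join(random.choices(VOCAB, k=length))

def remix_batch(task_examples, mix_ratio=0.5, generic_pool=None):
    """Replace a fraction of each batch with generic or random sequences."""
    n_mix = int(len(task_examples) * mix_ratio)
    pool = generic_pool or [random_word_sequence() for _ in range(n_mix)]
    mixed = task_examples[: len(task_examples) - n_mix] + random.sample(pool, n_mix)
    random.shuffle(mixed)
    return mixed

batch = [f"task example {i}" for i in range(8)]
print(remix_batch(batch, mix_ratio=0.25))  # 6 task examples + 2 mixed-in sequences
```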
arXiv:2411.06602 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Adaptive and Temporally Consistent Gaussian Surfels for Multi-view Dynamic Reconstruction
Authors: Decai Chen, Brianne Oberson, Ingo Feldmann, Oliver Schreer, Anna Hilsmann, Peter Eisert
Abstract: 3D Gaussian Splatting has recently achieved notable success in novel view synthesis for dynamic scenes and in geometry reconstruction for static scenes. Building on these advances, early methods perform dynamic surface reconstruction by globally optimizing entire sequences, but reconstructing dynamic scenes with significant topology changes, emerging or disappearing objects, and rapid movements remains a substantial challenge, particularly for long sequences. To address these issues, we propose AT-GS, a novel method for reconstructing high-quality dynamic surfaces from multi-view videos through per-frame incremental optimization. To avoid local minima across frames, we introduce a unified, adaptive, gradient-aware densification strategy that integrates the strengths of conventional cloning and splitting techniques. We additionally reduce temporal jittering in dynamic surfaces by enforcing consistency of curvature maps across consecutive frames. Our method achieves superior accuracy and temporal coherence in dynamic surface reconstruction, delivering high-fidelity space-time novel view synthesis even in complex and challenging scenes. Extensive experiments on diverse multi-view video datasets demonstrate the effectiveness of our approach, showing clear advantages over baseline methods. Project page: https://fraunhoferhhi.github.io/AT-GS
Submitted 10 November, 2024; originally announced November 2024.

arXiv:2411.05718 [pdf, other]
Subjects: cs.RO (Robotics); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
A Retrospective on the Robot Air Hockey Challenge: Benchmarking Robust, Reliable, and Safe Learning Techniques for Real-world Robotics
Authors: Puze Liu, Jonas Günster, Niklas Funk, Simon Gröger, Dong Chen, Haitham Bou-Ammar, Julius Jankowski, Ante Marić, Sylvain Calinon, Andrej Orsula, Miguel Olivares-Mendez, Hongyi Zhou, Rudolf Lioutikov, Gerhard Neumann, Amarildo Likmeta, Amirhossein Zhalehmehrabi, Thomas Bonenfant, Marcello Restelli, Davide Tateo, Ziyuan Liu, Jan Peters
Abstract: Machine learning methods have had a groundbreaking impact in many application domains, but their application on real robotic platforms is still limited. Despite the many challenges of combining machine learning with robotics, robot learning remains one of the most promising directions for enhancing the capabilities of robots, and deploying learning-based approaches on real robots requires extra effort to address various real-world factors. To investigate the key factors influencing real-world deployment and to encourage original solutions from different researchers, we organized the Robot Air Hockey Challenge at the NeurIPS 2023 conference. We selected air hockey as a benchmark task encompassing low-level robotics problems and high-level tactics. Unlike other machine learning-centric benchmarks, participants had to tackle practical robotics challenges such as the sim-to-real gap, low-level control issues, safety, real-time requirements, and the limited availability of real-world data. Furthermore, the task involves a dynamic environment, removing the quasi-static-motion assumption typical of other real-world benchmarks. The competition's results show that solutions combining learning-based approaches with prior knowledge outperform those relying solely on data when real-world deployment is challenging. Our ablation study reveals which real-world factors may be overlooked when building a learning-based solution. The successful real-world air hockey deployment of the best-performing agents sets a foundation for future competitions and follow-up research.
Submitted 8 November, 2024; originally announced November 2024.
Comments: Accepted at NeurIPS 2024 Dataset and Benchmark Track
arXiv:2411.04997 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.CL (Computation and Language)
LLM2CLIP: Powerful Language Model Unlocks Richer Visual Representation
Authors: Weiquan Huang, Aoqi Wu, Yifan Yang, Xufang Luo, Yuqing Yang, Liang Hu, Qi Dai, Xiyang Dai, Dongdong Chen, Chong Luo, Lili Qiu
Abstract: CLIP is a foundational multimodal model that aligns image and text features into a shared space using contrastive learning on large-scale image-text pairs. Its strength lies in leveraging natural language as a rich supervisory signal. With the rapid progress of large language models (LLMs), we explore their potential to further enhance CLIP's multimodal representation learning. This work introduces a fine-tuning approach that integrates LLMs with the pretrained CLIP visual encoder, leveraging LLMs' advanced text understanding and open-world knowledge to improve CLIP's ability to process long and complex captions. To address the challenge posed by the LLMs' autoregressive nature, we propose a caption-to-caption contrastive learning framework to enhance the discriminative power of their outputs. Our method achieves substantial gains on various downstream tasks, demonstrating the effectiveness of combining LLMs with CLIP for enhanced multimodal learning.
Submitted 26 November, 2024; v1 submitted 7 November, 2024; originally announced November 2024.
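A caption-to-caption contrastive objective of the kind named here can be sketched as a standard symmetric InfoNCE over paired caption embeddings. This is a generic formulation under the assumption that two captions of the same image form a positive pair; it is not the released LLM2CLIP training code.

```python
# Sketch: symmetric contrastive loss between paired caption embeddings.
import torch
import torch.nn.functional as F

def caption_contrastive_loss(emb_a, emb_b, temperature=0.07):
    """emb_a, emb_b: (B, D) embeddings of two captions of the same image."""
    a = F.normalize(emb_a, dim=-1)
    b = F.normalize(emb_b, dim=-1)
    logits = a @ b.t() / temperature   # (B, B) similarity matrix
    labels = torch.arange(a.size(0))   # matching pairs sit on the diagonal
    return (F.cross_entropy(logits, labels) +
            F.cross_entropy(logits.t(), labels)) / 2

loss = caption_contrastive_loss(torch.randn(8, 512), torch.randn(8, 512))
```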
arXiv:2411.04106 [pdf, other]
Subjects: eess.SY (Systems and Control); cs.LG (Machine Learning)
A Comparative Study of Deep Reinforcement Learning for Crop Production Management
Authors: Joseph Balderas, Dong Chen, Yanbo Huang, Li Wang, Ren-Cang Li
Abstract: Crop production management is essential for optimizing yield and minimizing the environmental impact on crop fields, yet it remains challenging due to the complex and stochastic processes involved. Recently, researchers have turned to machine learning to address these complexities. In particular, reinforcement learning (RL), an approach that learns optimal decision-making strategies through trial and error in dynamic environments, has emerged as a promising tool for developing adaptive crop management policies. RL models aim to optimize long-term rewards by continuously interacting with the environment, making them well suited to the uncertainty and variability inherent in crop management, and studies have shown that RL can generate crop management policies that compete with, and even outperform, expert-designed policies within simulation-based crop models. In the gym-DSSAT environment, one of the most widely used simulators for crop management, proximal policy optimization (PPO) and deep Q-networks (DQN) have shown promising results, but these methods have not yet been systematically evaluated under identical conditions. In this study, we evaluate PPO and DQN against static baseline policies across the three RL tasks provided by gym-DSSAT: fertilization, irrigation, and mixed management. To ensure a fair comparison, we use consistent default parameters, identical reward functions, and the same environment settings. Our results indicate that PPO outperforms DQN on the fertilization and irrigation tasks, while DQN excels on the mixed management task. This comparative analysis provides insight into the strengths and limitations of each approach, advancing the development of more effective RL-based crop management strategies.
Submitted 6 November, 2024; originally announced November 2024.
Comments: 10 pages
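The comparison rests on a standard agent-environment evaluation loop. A minimal gymnasium-style sketch follows; the environment id and policy objects are placeholders, and the actual gym-DSSAT setup should be taken from its own documentation.

```python
# Sketch: generic policy evaluation loop (gymnasium API; env id is a placeholder).
import gymnasium as gym

def evaluate(env_id, policy, episodes=10):
    env = gym.make(env_id)
    returns = []
    for _ in range(episodes):
        obs, info = env.reset()
        done, total = False, 0.0
        while not done:
            action = policy(obs)  # a trained PPO or DQN policy goes here
            obs, reward, terminated, truncated, info = env.step(action)
            total += reward
            done = terminated or truncated
        returns.append(total)
    return sum(returns) / len(returns)

# e.g. evaluate("GymDssatFertilization-v0", ppo_policy)  # hypothetical env id
```

Holding the environment id, reward function, and default parameters fixed across both agents is exactly the "identical conditions" control the study emphasizes.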
arXiv:2411.00612 [pdf, ps, other]
Subjects: cs.SI (Social and Information Networks); cs.AI (Artificial Intelligence)
How to Bridge Spatial and Temporal Heterogeneity in Link Prediction? A Contrastive Method
Authors: Yu Tai, Xinglong Wu, Hongwei Yang, Hui He, Duanjing Chen, Yuanming Shao, Weizhe Zhang
Abstract: Temporal Heterogeneous Networks play a crucial role in capturing the dynamics and heterogeneity inherent in various real-world complex systems, making them a noteworthy research avenue for link prediction. However, existing methods fail to capture fine-grained differential distribution patterns and temporal dynamic characteristics, which we refer to as spatial heterogeneity and temporal heterogeneity. To overcome these limitations, we propose CLP, a novel Contrastive Learning-based Link Prediction model that employs a multi-view hierarchical self-supervised architecture to encode spatial and temporal heterogeneity. Specifically, for spatial heterogeneity, we develop a spatial feature modeling layer that captures fine-grained topological distribution patterns from node- and edge-level representations. For temporal heterogeneity, we devise a temporal information modeling layer that perceives the evolutionary dependencies of dynamic graph topologies from time-level representations. Finally, we encode the spatial and temporal distribution heterogeneity from a contrastive learning perspective, enabling comprehensive self-supervised hierarchical relation modeling for the link prediction task. Extensive experiments on four real-world dynamic heterogeneous network datasets verify that CLP consistently outperforms state-of-the-art models, with average improvements of 10.10% in AUC and 13.44% in AP.
Submitted 1 November, 2024; originally announced November 2024.

arXiv:2411.00577 [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
WLPlan: Relational Features for Symbolic Planning
Authors: Dillon Z. Chen
Abstract: Research on scalable learning for planning generally involves juggling different programming languages to handle the learning and planning modules effectively. Interpreted languages such as Python are commonly used for learning routines due to their ease of use and the abundance of well-maintained learning libraries available for them, while compiled languages such as C++ are used for planning routines due to their optimised resource usage. Motivated by the need for tools for developing scalable learning planners, we introduce WLPlan, a C++ package with Python bindings that implements recent promising work on automatically generating relational features of planning tasks. Such features can be used in any downstream routine, such as learning domain control knowledge or probing and understanding planning tasks. More specifically, WLPlan provides functionality for (1) transforming planning tasks into graphs, and (2) embedding planning graphs into feature vectors via graph kernels. The source code and instructions for installing and using WLPlan are available at tinyurl.com/42kymswc
Submitted 1 November, 2024; originally announced November 2024.
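The graph kernels named in step (2) are, in the line of work WLPlan builds on, based on Weisfeiler-Leman (WL) color refinement. A toy version of the core loop follows; this is an illustration of the technique, not WLPlan's optimized C++ implementation, and production code would use a stable injective relabeling rather than Python's hash.

```python
# Toy Weisfeiler-Leman color refinement producing a color-count feature map.
from collections import Counter

def wl_features(adj, labels, iterations=2):
    """adj: {node: [neighbors]}; labels: {node: initial label}."""
    colors = dict(labels)
    hist = Counter(colors.values())  # iteration-0 color counts
    for _ in range(iterations):
        colors = {
            v: hash((colors[v], tuple(sorted(str(colors[u]) for u in adj[v]))))
            for v in adj
        }
        hist.update(colors.values())  # accumulate refined colors
    return hist                        # sparse feature vector: color -> count

adj = {0: [1], 1: [0, 2], 2: [1]}
labels = {0: "a", 1: "b", 2: "a"}
print(wl_features(adj, labels))
```

Counting refined colors across iterations yields a fixed-length (sparse) feature vector for any graph size, which is what lets one planner-learning pipeline handle instances with arbitrary numbers of objects.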
arXiv:2411.00399 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.GR (Graphics)
StyleTex: Style Image-Guided Texture Generation for 3D Models
Authors: Zhiyu Xie, Yuqing Zhang, Xiangjun Tang, Yiqian Wu, Dehan Chen, Gongsheng Li, Xaogang Jin
Abstract: Given a reference style image and a 3D mesh with its text description, style-guided texture generation aims to produce a texture that harmonizes with both the style of the reference image and the geometry of the input mesh. Although diffusion-based 3D texture generation methods such as distillation sampling have numerous promising applications in stylized games and films, they must address two challenges: 1) completely decoupling style and content from the reference image for 3D models, and 2) aligning the generated texture with the color tone and style of the reference image as well as the given text prompt. To this end, we introduce StyleTex, a diffusion-model-based framework for creating stylized textures for 3D models. Our key insight is to decouple style information from the reference image while disregarding its content during diffusion-based distillation sampling. Specifically, given a reference image, we decompose its style feature from the image CLIP embedding by subtracting the embedding's orthogonal projection in the direction of the content feature, which is represented by a text CLIP embedding. Disentangling the reference image's style and content in this way lets us generate distinct style and content features. We then inject the style feature into the cross-attention mechanism to incorporate it into the generation process, while using the content feature as a negative prompt to further dissociate content information. Finally, we incorporate these strategies into StyleTex to obtain stylized textures. Textures generated by StyleTex retain the style of the reference image while also aligning with the text prompt and the intrinsic details of the given 3D mesh. Quantitative and qualitative experiments show that our method outperforms existing baselines by a significant margin.
Submitted 1 November, 2024; originally announced November 2024.
Comments: Accepted to SIGGRAPH Asia 2024
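The decomposition described here is a plain orthogonal projection in CLIP embedding space: subtract from the image embedding its projection onto the (text) content direction, and what remains is taken as style. A short sketch with stand-in vectors follows; actual embeddings would come from a CLIP image and text encoder.

```python
# Sketch: style/content decomposition by orthogonal projection (stand-in vectors).
import numpy as np

def decompose_style(e_img, e_content):
    c = e_content / np.linalg.norm(e_content)
    content_part = (e_img @ c) * c      # projection of e_img onto content direction
    style_part = e_img - content_part   # residual: the style component
    return style_part, content_part

e_img = np.random.randn(512)   # stand-in for an image CLIP embedding
e_txt = np.random.randn(512)   # stand-in for a text (content) CLIP embedding
style, content = decompose_style(e_img, e_txt)
print(np.dot(style, e_txt / np.linalg.norm(e_txt)))  # ~0: style is orthogonal to content
```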
Specifically, reinforcement learning (RL), a cutting-edge approach designed to learn optimal decision-making strategies through trial and error in dynamic environments, has emerged as a promising tool for developing adaptive crop management policies. RL models aim to optimize long-term rewards by continuously interacting with the environment, making them well-suited for tackling the uncertainties and variability inherent in crop management. Studies have shown that RL can generate crop management policies that compete with, and even outperform, expert-designed policies within simulation-based crop models. In the gym-DSSAT crop model environment, one of the most widely used simulators for crop management, proximal policy optimization (PPO) and deep Q-networks (DQN) have shown promising results. However, these methods have not yet been systematically evaluated under identical conditions. In this study, we evaluated PPO and DQN against static baseline policies across three different RL tasks, fertilization, irrigation, and mixed management, provided by the gym-DSSAT environment. To ensure a fair comparison, we used consistent default parameters, identical reward functions, and the same environment settings. Our results indicate that PPO outperforms DQN in fertilization and irrigation tasks, while DQN excels in the mixed management task. This comparative analysis provides critical insights into the strengths and limitations of each approach, advancing the development of more effective RL-based crop management strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04106v1-abstract-full').style.display = 'none'; document.getElementById('2411.04106v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00612">arXiv:2411.00612</a> <span> [<a href="https://arxiv.org/pdf/2411.00612">pdf</a>, <a href="https://arxiv.org/ps/2411.00612">ps</a>, <a href="https://arxiv.org/format/2411.00612">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> How to Bridge Spatial and Temporal Heterogeneity in Link Prediction? 
A Contrastive Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tai%2C+Y">Yu Tai</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xinglong Wu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hongwei Yang</a>, <a href="/search/cs?searchtype=author&query=He%2C+H">Hui He</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Duanjing Chen</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Y">Yuanming Shao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weizhe Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00612v1-abstract-short" style="display: inline;"> Temporal Heterogeneous Networks play a crucial role in capturing the dynamics and heterogeneity inherent in various real-world complex systems, rendering them a noteworthy research avenue for link prediction. However, existing methods fail to capture the fine-grained differential distribution patterns and temporal dynamic characteristics, which we refer to as spatial heterogeneity and temporal het… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00612v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00612v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00612v1-abstract-full" style="display: none;"> Temporal Heterogeneous Networks play a crucial role in capturing the dynamics and heterogeneity inherent in various real-world complex systems, rendering them a noteworthy research avenue for link prediction. However, existing methods fail to capture the fine-grained differential distribution patterns and temporal dynamic characteristics, which we refer to as spatial heterogeneity and temporal heterogeneity. To overcome such limitations, we propose a novel \textbf{C}ontrastive Learning-based \textbf{L}ink \textbf{P}rediction model, \textbf{CLP}, which employs a multi-view hierarchical self-supervised architecture to encode spatial and temporal heterogeneity. Specifically, aiming at spatial heterogeneity, we develop a spatial feature modeling layer to capture the fine-grained topological distribution patterns from node- and edge-level representations, respectively. Furthermore, aiming at temporal heterogeneity, we devise a temporal information modeling layer to perceive the evolutionary dependencies of dynamic graph topologies from time-level representations. Finally, we encode the spatial and temporal distribution heterogeneity from a contrastive learning perspective, enabling a comprehensive self-supervised hierarchical relation modeling for the link prediction task. Extensive experiments conducted on four real-world dynamic heterogeneous network datasets verify that our \mymodel consistently outperforms the state-of-the-art models, demonstrating an average improvement of 10.10\%, 13.44\% in terms of AUC and AP, respectively. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00612v1-abstract-full').style.display = 'none'; document.getElementById('2411.00612v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00577">arXiv:2411.00577</a> <span> [<a href="https://arxiv.org/pdf/2411.00577">pdf</a>, <a href="https://arxiv.org/format/2411.00577">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> WLPlan: Relational Features for Symbolic Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+D+Z">Dillon Z. Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00577v1-abstract-short" style="display: inline;"> Scalable learning for planning research generally involves juggling between different programming languages for handling learning and planning modules effectively. Interpreted languages such as Python are commonly used for learning routines due to their ease of use and the abundance of highly maintained learning libraries they exhibit, while compiled languages such as C++ are used for planning rou… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00577v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00577v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00577v1-abstract-full" style="display: none;"> Scalable learning for planning research generally involves juggling between different programming languages for handling learning and planning modules effectively. Interpreted languages such as Python are commonly used for learning routines due to their ease of use and the abundance of highly maintained learning libraries they exhibit, while compiled languages such as C++ are used for planning routines due to their optimised resource usage. Motivated by the need for tools for developing scalable learning planners, we introduce WLPlan, a C++ package with Python bindings which implements recent promising work for automatically generating relational features of planning tasks. Such features can be used for any downstream routine, such as learning domain control knowledge or probing and understanding planning tasks. More specifically, WLPlan provides functionality for (1) transforming planning tasks into graphs, and (2) embedding planning graphs into feature vectors via graph kernels. 
The source code and instructions for the installation and usage of WLPlan are available at tinyurl.com/42kymswc <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00577v1-abstract-full').style.display = 'none'; document.getElementById('2411.00577v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00399">arXiv:2411.00399</a> <span> [<a href="https://arxiv.org/pdf/2411.00399">pdf</a>, <a href="https://arxiv.org/format/2411.00399">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> StyleTex: Style Image-Guided Texture Generation for 3D Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zhiyu Xie</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuqing Zhang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiangjun Tang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yiqian Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dehan Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gongsheng Li</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+X">Xiaogang Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00399v1-abstract-short" style="display: inline;"> Style-guided texture generation aims to generate a texture that is harmonious with both the style of the reference image and the geometry of the input mesh, given a reference style image and a 3D mesh with its text description. Although diffusion-based 3D texture generation methods, such as distillation sampling, have numerous promising applications in stylized games and films, they require address… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00399v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00399v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00399v1-abstract-full" style="display: none;"> Style-guided texture generation aims to generate a texture that is harmonious with both the style of the reference image and the geometry of the input mesh, given a reference style image and a 3D mesh with its text description. Although diffusion-based 3D texture generation methods, such as distillation sampling, have numerous promising applications in stylized games and films, they require addressing two challenges: 1) completely decoupling style and content from the reference image for 3D models, and 2) aligning the generated texture with the color tone and style of the reference image and with the given text prompt. To this end, we introduce StyleTex, an innovative diffusion-model-based framework for creating stylized textures for 3D models. 
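<p class="is-size-7">The abstract goes on to describe decomposing a style feature by removing the orthogonal projection of the image embedding onto a content (text) embedding; a minimal NumPy sketch of that decomposition follows, with random vectors standing in for real CLIP embeddings.</p> <pre><code class="language-python">
import numpy as np

def style_component(image_emb, content_emb):
    """Remove the content direction from an image embedding: subtract
    the orthogonal projection of the image embedding onto the content
    embedding, leaving a style-only residual."""
    c = content_emb / np.linalg.norm(content_emb)
    return image_emb - (image_emb @ c) * c

rng = np.random.default_rng(7)
img = rng.normal(size=512)   # stand-in for a CLIP image embedding
txt = rng.normal(size=512)   # stand-in for the content text embedding
style = style_component(img, txt)
print(abs(style @ txt / np.linalg.norm(txt)))   # ~0: content removed
</code></pre>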
Our key insight is to decouple style information from the reference image while disregarding content in diffusion-based distillation sampling. Specifically, given a reference image, we first decompose its style feature from the image CLIP embedding by subtracting the embedding's orthogonal projection in the direction of the content feature, which is represented by a text CLIP embedding. Our novel approach to disentangling the reference image's style and content information allows us to generate distinct style and content features. We then inject the style feature into the cross-attention mechanism to incorporate it into the generation process, while utilizing the content feature as a negative prompt to further dissociate content information. Finally, we incorporate these strategies into StyleTex to obtain stylized textures. The resulting textures generated by StyleTex retain the style of the reference image, while also aligning with the text prompts and intrinsic details of the given 3D mesh. Quantitative and qualitative experiments show that our method outperforms existing baseline methods by a significant margin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00399v1-abstract-full').style.display = 'none'; document.getElementById('2411.00399v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to SIGGRAPH Asia 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.24080">arXiv:2410.24080</a> <span> [<a href="https://arxiv.org/pdf/2410.24080">pdf</a>, <a href="https://arxiv.org/format/2410.24080">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Graph Learning for Numeric Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+D+Z">Dillon Z. Chen</a>, <a href="/search/cs?searchtype=author&query=Thi%C3%A9baux%2C+S">Sylvie Thiébaux</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.24080v1-abstract-short" style="display: inline;"> Graph learning is naturally well suited for use in symbolic, object-centric planning due to its ability to exploit relational structures exhibited in planning domains and to take as input planning instances with arbitrary numbers of objects. Numeric planning is an extension of symbolic planning in which states may now also exhibit numeric variables. 
In this work, we propose data-efficient and inte… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24080v1-abstract-full').style.display = 'inline'; document.getElementById('2410.24080v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.24080v1-abstract-full" style="display: none;"> Graph learning is naturally well suited for use in symbolic, object-centric planning due to its ability to exploit relational structures exhibited in planning domains and to take as input planning instances with arbitrary numbers of objects. Numeric planning is an extension of symbolic planning in which states may now also exhibit numeric variables. In this work, we propose data-efficient and interpretable machine learning models for learning to solve numeric planning tasks. This involves constructing a new graph kernel for graphs with both continuous and categorical attributes, as well as new optimisation methods for learning heuristic functions for numeric planning. Experiments show that our graph kernels are vastly more efficient and generalise better than graph neural networks for numeric planning, and also yield competitive coverage performance compared to domain-independent numeric planners. Code is available at https://github.com/DillonZChen/goose <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.24080v1-abstract-full').style.display = 'none'; document.getElementById('2410.24080v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Extended version of NeurIPS 2024 paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20313">arXiv:2410.20313</a> <span> [<a href="https://arxiv.org/pdf/2410.20313">pdf</a>, <a href="https://arxiv.org/format/2410.20313">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Efficient Circuit Wire Cutting Based on Commuting Groups </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xinpeng Li</a>, <a href="/search/cs?searchtype=author&query=Kulkarni%2C+V">Vinooth Kulkarni</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D+T">Daniel T. 
Chen</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+Q">Qiang Guan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Weiwen Jiang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+N">Ning Xie</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+S">Shuai Xu</a>, <a href="/search/cs?searchtype=author&query=Chaudhary%2C+V">Vipin Chaudhary</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20313v1-abstract-short" style="display: inline;"> Current quantum devices face challenges when dealing with large circuits due to error rates as circuit size and the number of qubits increase. The circuit wire-cutting technique addresses this issue by breaking down a large circuit into smaller, more manageable subcircuits. However, the exponential increase in the number of subcircuits and the complexity of reconstruction as more cuts are made pos… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20313v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20313v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20313v1-abstract-full" style="display: none;"> Current quantum devices face challenges when dealing with large circuits due to error rates as circuit size and the number of qubits increase. The circuit wire-cutting technique addresses this issue by breaking down a large circuit into smaller, more manageable subcircuits. However, the exponential increase in the number of subcircuits and the complexity of reconstruction as more cuts are made pose a great practical challenge. Inspired by ancilla-assisted quantum process tomography and the MUBs-based grouping technique for simultaneous measurement, we propose a new approach that can reduce subcircuit running overhead. The approach first uses ancillary qubits to transform all quantum input initializations into quantum output measurements. These output measurements are then organized into commuting groups for the purpose of simultaneous measurement, based on MUBs-based grouping. This approach significantly reduces the number of necessary subcircuits as well as the total number of shots. Lastly, we provide numerical experiments to demonstrate the complexity reduction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20313v1-abstract-full').style.display = 'none'; document.getElementById('2410.20313v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
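<p class="is-size-7">As a toy illustration of the grouping idea, the sketch below packs Pauli observables into qubit-wise commuting groups so that each group needs only one measurement setting; the greedy packing and the example observables are illustrative simplifications, not the paper's MUBs-based construction.</p> <pre><code class="language-python">
def qubit_wise_commute(p, q):
    """Two Pauli strings commute qubit-wise if, on every qubit, the
    letters are equal or one of them is the identity 'I'."""
    return all(a == b or a == "I" or b == "I" for a, b in zip(p, q))

def greedy_groups(paulis):
    """Greedily pack observables into simultaneously measurable groups."""
    groups = []
    for p in paulis:
        for g in groups:
            if all(qubit_wise_commute(p, q) for q in g):
                g.append(p)
                break
        else:
            groups.append([p])
    return groups

obs = ["ZZI", "ZIZ", "IZZ", "XXI", "XIX", "IYY"]
for g in greedy_groups(obs):
    print(g)   # each group shares one measurement setting
</code></pre>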
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in IEEE International Conference on Quantum Computing and Engineering - QCE24</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18998">arXiv:2410.18998</a> <span> [<a href="https://arxiv.org/pdf/2410.18998">pdf</a>, <a href="https://arxiv.org/format/2410.18998">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DamFormer: Generalizing Morphologies in Dam Break Simulations Using Transformer Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mul%2C+Z">Zhaoyang Mul</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+A">Aoming Liang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+M">Mingming Ge</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dashuai Chen</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+D">Dixia Fan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Minyi Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18998v1-abstract-short" style="display: inline;"> The interaction of waves with structural barriers such as dams breaking plays a critical role in flood defense and tsunami disasters. In this work, we explore the dynamic changes in wave surfaces impacting various structural shapes, e.g., circle, triangle, and square, by using deep learning techniques. We introduce the DamFormer, a novel transformer-based model designed to learn and simulate these… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18998v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18998v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18998v1-abstract-full" style="display: none;"> The interaction of waves with structural barriers such as dams breaking plays a critical role in flood defense and tsunami disasters. In this work, we explore the dynamic changes in wave surfaces impacting various structural shapes, e.g., circle, triangle, and square, by using deep learning techniques. We introduce the DamFormer, a novel transformer-based model designed to learn and simulate these complex interactions. The model was trained and tested on simulated data representing the three structural forms. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18998v1-abstract-full').style.display = 'none'; document.getElementById('2410.18998v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18035">arXiv:2410.18035</a> <span> [<a href="https://arxiv.org/pdf/2410.18035">pdf</a>, <a href="https://arxiv.org/format/2410.18035">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> MiLoRA: Efficient Mixture of Low-Rank Adaptation for Large Language Models Fine-tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yi Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dan Chen</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+X">Xing Tian</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Huanran Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wei Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18035v1-abstract-short" style="display: inline;"> Low-rank adaptation (LoRA) and its mixture-of-experts (MOE) variants are highly effective parameter-efficient fine-tuning (PEFT) methods. However, they introduce significant latency in multi-tenant settings due to the LoRA modules and MOE routers added to multiple linear modules in the Transformer layer. To address this issue, we propose Mixture of Low-Rank Adaptation (MiLoRA), a novel and efficie… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18035v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18035v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18035v1-abstract-full" style="display: none;"> Low-rank adaptation (LoRA) and its mixture-of-experts (MOE) variants are highly effective parameter-efficient fine-tuning (PEFT) methods. However, they introduce significant latency in multi-tenant settings due to the LoRA modules and MOE routers added to multiple linear modules in the Transformer layer. To address this issue, we propose Mixture of Low-Rank Adaptation (MiLoRA), a novel and efficient LoRA variant. MiLoRA differs from previous MOE-style LoRA methods by considering each LoRA module as an expert and employing a prompt-aware routing mechanism. This mechanism calculates expert routing results once before generating the first new token and reuses these results for subsequent tokens, reducing latency. Extensive experiments and analysis on commonsense reasoning tasks, math reasoning tasks, and widely used LLM evaluation benchmarks demonstrate that MiLoRA consistently outperforms strong PEFT baselines with comparable tunable parameter budgets. Additionally, MiLoRA significantly reduces latency in multi-tenant settings compared to previous LoRA-based methods. 
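<p class="is-size-7">A minimal NumPy sketch of prompt-aware routing as described above: expert weights are computed once from the prompt's hidden states and then reused for every generated token. All shapes, names, and the linear experts here are invented for illustration and do not reflect MiLoRA's implementation.</p> <pre><code class="language-python">
import numpy as np

rng = np.random.default_rng(1)
d, n_experts = 32, 4
experts = [rng.normal(scale=0.01, size=(d, d)) for _ in range(n_experts)]
router_w = rng.normal(size=(d, n_experts))

def route_once(prompt_hidden):
    """Prompt-aware routing: score experts from the mean prompt state
    a single time, instead of re-running the router per token."""
    logits = prompt_hidden.mean(axis=0) @ router_w
    weights = np.exp(logits - logits.max())
    return weights / weights.sum()

def lora_mixture(h, weights):
    """Apply the frozen routing weights to every subsequent token."""
    delta = sum(w * (h @ e) for w, e in zip(weights, experts))
    return h + delta

prompt = rng.normal(size=(10, d))   # hidden states of the prompt tokens
w = route_once(prompt)              # computed once, before generation
new_token = rng.normal(size=(1, d))
print(lora_mixture(new_token, w).shape)
</code></pre>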
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18035v1-abstract-full').style.display = 'none'; document.getElementById('2410.18035v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024 Findings. arXiv admin note: substantial text overlap with arXiv:2405.18203</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16676">arXiv:2410.16676</a> <span> [<a href="https://arxiv.org/pdf/2410.16676">pdf</a>, <a href="https://arxiv.org/format/2410.16676">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Improving Causal Reasoning in Large Language Models: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+L">Longxuan Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Delin Chen</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+S">Siheng Xiong</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qingyang Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qingzhen Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dawei Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhikai Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoze Liu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+L">Liangming Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16676v3-abstract-short" style="display: inline;"> Causal reasoning (CR) is a crucial aspect of intelligence, essential for problem-solving, decision-making, and understanding the world. While large language models (LLMs) can generate rationales for their outputs, their ability to reliably perform causal reasoning remains uncertain, often falling short in tasks requiring a deep understanding of causality. In this survey, we provide a comprehensive… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16676v3-abstract-full').style.display = 'inline'; document.getElementById('2410.16676v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16676v3-abstract-full" style="display: none;"> Causal reasoning (CR) is a crucial aspect of intelligence, essential for problem-solving, decision-making, and understanding the world. While large language models (LLMs) can generate rationales for their outputs, their ability to reliably perform causal reasoning remains uncertain, often falling short in tasks requiring a deep understanding of causality. In this survey, we provide a comprehensive review of research aimed at enhancing LLMs for causal reasoning. 
We categorize existing methods based on the role of LLMs: either as reasoning engines or as helpers providing knowledge or data to traditional CR methods, followed by a detailed discussion of the methodologies in each category. We then evaluate the performance of LLMs on various causal reasoning tasks, providing key findings and in-depth analysis. Finally, we provide insights from current studies and highlight promising directions for future research. We aim for this work to serve as a comprehensive resource, fostering further advancements in causal reasoning with LLMs. Resources are available at https://github.com/chendl02/Awesome-LLM-causal-reasoning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16676v3-abstract-full').style.display = 'none'; document.getElementById('2410.16676v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13761">arXiv:2410.13761</a> <span> [<a href="https://arxiv.org/pdf/2410.13761">pdf</a>, <a href="https://arxiv.org/format/2410.13761">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GDeR: Safeguarding Efficiency, Balancing, and Robustness via Prototypical Graph Pruning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+G">Guibin Zhang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+H">Haonan Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuchen Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhixun Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dingshuo Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kai Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianlong Chen</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yuxuan Liang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+D">Dawei Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kun Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13761v1-abstract-short" style="display: inline;"> Training high-quality deep models necessitates vast amounts of data, resulting in overwhelming computational and memory demands. Recently, data pruning, distillation, and coreset selection have been developed to streamline data volume by retaining, synthesizing, or selecting a small yet informative subset from the full set. 
Among these methods, data pruning incurs the least additional training cos… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13761v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13761v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13761v1-abstract-full" style="display: none;"> Training high-quality deep models necessitates vast amounts of data, resulting in overwhelming computational and memory demands. Recently, data pruning, distillation, and coreset selection have been developed to streamline data volume by retaining, synthesizing, or selecting a small yet informative subset from the full set. Among these methods, data pruning incurs the least additional training cost and offers the most practical acceleration benefits. However, it is the most vulnerable, often suffering significant performance degradation with imbalanced or biased data schema, thus raising concerns about its accuracy and reliability in on-device deployment. Therefore, there is a looming need for a new data pruning paradigm that maintains the efficiency of previous practices while ensuring balance and robustness. Unlike the fields of computer vision and natural language processing, where mature solutions have been developed to address these issues, graph neural networks (GNNs) continue to struggle with increasingly large-scale, imbalanced, and noisy datasets, lacking a unified dataset pruning solution. To fill this gap, we introduce a novel dynamic soft-pruning method, GDeR, designed to update the training ``basket'' during the process using trainable prototypes. GDeR first constructs a well-modeled graph embedding hypersphere and then samples \textit{representative, balanced, and unbiased subsets} from this embedding space, which achieves the goal we call Graph Training Debugging. Extensive experiments on five datasets across three GNN backbones demonstrate that GDeR (I) achieves or surpasses the performance of the full dataset with 30%~50% fewer training samples, (II) attains up to a 2.81x lossless training speedup, and (III) outperforms state-of-the-art pruning methods in imbalanced training and noisy training scenarios by 0.3%~4.3% and 3.6%~7.8%, respectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13761v1-abstract-full').style.display = 'none'; document.getElementById('2410.13761v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
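<p class="is-size-7">A toy sketch of prototype-based selection in the spirit of the description above: samples are scored by distance to their nearest trainable prototype, and only the best-modelled fraction is kept in the training "basket". The scoring rule and names are simplifications for illustration, not GDeR itself.</p> <pre><code class="language-python">
import numpy as np

def prototype_keep_mask(embeddings, prototypes, keep_ratio=0.7):
    """Score each sample by its distance to the nearest prototype and
    keep the closest fraction, a toy analogue of soft-pruning a
    training basket from an embedding space."""
    d = np.linalg.norm(embeddings[:, None, :] - prototypes[None, :, :], axis=-1)
    score = d.min(axis=1)              # distance to nearest prototype
    k = int(keep_ratio * len(embeddings))
    keep = np.argsort(score)[:k]       # best-modelled samples are kept
    mask = np.zeros(len(embeddings), dtype=bool)
    mask[keep] = True
    return mask

rng = np.random.default_rng(2)
emb = rng.normal(size=(100, 8))        # graph embeddings
protos = rng.normal(size=(3, 8))       # stand-ins for trainable prototypes
print(prototype_keep_mask(emb, protos).sum(), "samples kept of 100")
</code></pre>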
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13451">arXiv:2410.13451</a> <span> [<a href="https://arxiv.org/pdf/2410.13451">pdf</a>, <a href="https://arxiv.org/ps/2410.13451">ps</a>, <a href="https://arxiv.org/format/2410.13451">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> Parallel and Distributed Expander Decomposition: Simple, Fast, and Near-Optimal </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+D">Daoyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Meierhans%2C+S">Simon Meierhans</a>, <a href="/search/cs?searchtype=author&query=Gutenberg%2C+M+P">Maximilian Probst Gutenberg</a>, <a href="/search/cs?searchtype=author&query=Saranurak%2C+T">Thatchaphol Saranurak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13451v1-abstract-short" style="display: inline;"> Expander decompositions have become one of the central frameworks in the design of fast algorithms. For an undirected graph $G=(V,E)$, a near-optimal $蠁$-expander decomposition is a partition $V_1, V_2, \ldots, V_k$ of the vertex set $V$ where each subgraph $G[V_i]$ is a $蠁$-expander, and only an $\widetilde{O}(蠁)$-fraction of the edges cross between partition sets. In this article, we give the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13451v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13451v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13451v1-abstract-full" style="display: none;"> Expander decompositions have become one of the central frameworks in the design of fast algorithms. For an undirected graph $G=(V,E)$, a near-optimal $蠁$-expander decomposition is a partition $V_1, V_2, \ldots, V_k$ of the vertex set $V$ where each subgraph $G[V_i]$ is a $蠁$-expander, and only an $\widetilde{O}(蠁)$-fraction of the edges cross between partition sets. In this article, we give the first near-optimal \emph{parallel} algorithm to compute $蠁$-expander decompositions in near-linear work $\widetilde{O}(m/蠁^2)$ and near-constant span $\widetilde{O}(1/蠁^4)$. Our algorithm is very simple and likely practical. Our algorithm can also be implemented in the distributed Congest model in $\tilde{O}(1/蠁^4)$ rounds. Our results surpass the theoretical guarantees of the current state-of-the-art parallel algorithms [Chang-Saranurak PODC'19, Chang-Saranurak FOCS'20], while being the first to ensure that only an $\tilde{O}(蠁)$ fraction of edges cross between partition sets. In contrast, previous algorithms [Chang-Saranurak PODC'19, Chang-Saranurak FOCS'20] admit at least an $O(蠁^{1/3})$ fraction of crossing edges, a polynomial loss in quality inherent to their random-walk-based techniques. Our algorithm, instead, leverages flow-based techniques and extends the popular sequential algorithm presented in [Saranurak-Wang SODA'19]. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13451v1-abstract-full').style.display = 'none'; document.getElementById('2410.13451v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear at SODA'25</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13043">arXiv:2410.13043</a> <span> [<a href="https://arxiv.org/pdf/2410.13043">pdf</a>, <a href="https://arxiv.org/format/2410.13043">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UniCoN: Universal Conditional Networks for Multi-Age Embryonic Cartilage Segmentation with Sparsely Annotated Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sapkota%2C+N">Nishchal Sapkota</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yejia Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zihao Zhao</a>, <a href="/search/cs?searchtype=author&query=Gomez%2C+M">Maria Gomez</a>, <a href="/search/cs?searchtype=author&query=Hsi%2C+Y">Yuhan Hsi</a>, <a href="/search/cs?searchtype=author&query=Wilson%2C+J+A">Jordan A. Wilson</a>, <a href="/search/cs?searchtype=author&query=Kawasaki%2C+K">Kazuhiko Kawasaki</a>, <a href="/search/cs?searchtype=author&query=Holmes%2C+G">Greg Holmes</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+M">Meng Wu</a>, <a href="/search/cs?searchtype=author&query=Jabs%2C+E+W">Ethylin Wang Jabs</a>, <a href="/search/cs?searchtype=author&query=Richtsmeier%2C+J+T">Joan T. Richtsmeier</a>, <a href="/search/cs?searchtype=author&query=Perrine%2C+S+M+M">Susan M. Motch Perrine</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D+Z">Danny Z. Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13043v1-abstract-short" style="display: inline;"> Osteochondrodysplasia, affecting 2-3% of newborns globally, is a group of bone and cartilage disorders that often result in head malformations, contributing to childhood morbidity and reduced quality of life. Current research on this disease using mouse models faces challenges since it involves accurately segmenting the developing cartilage in 3D micro-CT images of embryonic mice. 
Tackling this se… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13043v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13043v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13043v1-abstract-full" style="display: none;"> Osteochondrodysplasia, affecting 2-3% of newborns globally, is a group of bone and cartilage disorders that often result in head malformations, contributing to childhood morbidity and reduced quality of life. Current research on this disease using mouse models faces challenges since it involves accurately segmenting the developing cartilage in 3D micro-CT images of embryonic mice. Tackling this segmentation task with deep learning (DL) methods is laborious due to the heavy burden of manual image annotation, expensive due to the high acquisition costs of 3D micro-CT images, and difficult due to embryonic cartilage's complex and rapidly changing shapes. While DL approaches have been proposed to automate cartilage segmentation, most such models have limited accuracy and generalizability, especially across data from different embryonic age groups. To address these limitations, we propose novel DL methods that can be adopted by any DL architectures -- including CNNs, Transformers, or hybrid models -- which effectively leverage age and spatial information to enhance model performance. Specifically, we propose two new mechanisms, one conditioned on discrete age categories and the other on continuous image crop locations, to enable an accurate representation of cartilage shape changes across ages and local shape details throughout the cranial region. Extensive experiments on multi-age cartilage segmentation datasets show significant and consistent performance improvements when integrating our conditional modules into popular DL segmentation architectures. On average, we achieve a 1.7% Dice score increase with minimal computational overhead and a 7.5% improvement on unseen data. These results highlight the potential of our approach for developing robust, universal models capable of handling diverse datasets with limited annotated data, a key challenge in DL-based medical image analysis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13043v1-abstract-full').style.display = 'none'; document.getElementById('2410.13043v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
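<p class="is-size-7">A minimal sketch of discrete-category conditioning in the FiLM style, one plausible reading of a module conditioned on age categories: each age group owns a learned scale and shift applied to backbone features. The class, parameter names, and shapes are hypothetical, not the paper's modules.</p> <pre><code class="language-python">
import numpy as np

class AgeConditioner:
    """FiLM-style conditioning: each discrete age group owns a learned
    scale (gamma) and shift (beta) applied to backbone features."""
    def __init__(self, n_ages, channels, seed=0):
        rng = np.random.default_rng(seed)
        self.gamma = rng.normal(1.0, 0.1, size=(n_ages, channels))
        self.beta = rng.normal(0.0, 0.1, size=(n_ages, channels))

    def __call__(self, feats, age_id):
        # feats: (channels, H, W) feature map from any backbone stage
        g = self.gamma[age_id][:, None, None]
        b = self.beta[age_id][:, None, None]
        return g * feats + b

cond = AgeConditioner(n_ages=4, channels=16)
x = np.random.default_rng(1).normal(size=(16, 8, 8))
print(cond(x, age_id=2).shape)   # (16, 8, 8), now age-aware
</code></pre>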
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12983">arXiv:2410.12983</a> <span> [<a href="https://arxiv.org/pdf/2410.12983">pdf</a>, <a href="https://arxiv.org/format/2410.12983">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Reinforcement Learning with Euclidean Data Augmentation for State-Based Continuous Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jinzhu Luo</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dingyang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12983v1-abstract-short" style="display: inline;"> Data augmentation creates new data points by transforming the original ones for a reinforcement learning (RL) agent to learn from, which has been shown to be effective for the objective of improving the data efficiency of RL for continuous control. Prior work towards this objective has been largely restricted to perturbation-based data augmentation where new data points are created by perturbing t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12983v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12983v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12983v1-abstract-full" style="display: none;"> Data augmentation creates new data points by transforming the original ones for a reinforcement learning (RL) agent to learn from, which has been shown to be effective for the objective of improving the data efficiency of RL for continuous control. Prior work towards this objective has been largely restricted to perturbation-based data augmentation where new data points are created by perturbing the original ones, which has been impressively effective for tasks where the RL agent observes control states as images with perturbations including random cropping, shifting, etc. This work focuses on state-based control, where the RL agent can directly observe raw kinematic and task features, and considers an alternative data augmentation applied to these features based on Euclidean symmetries under transformations like rotations. We show that the default state features used in existing benchmark tasks that are based on joint configurations are not amenable to Euclidean transformations. We therefore advocate using state features based on configurations of the limbs (i.e., the rigid bodies connected by the joints) that instead provide rich augmented data under Euclidean transformations. With minimal hyperparameter tuning, we show this new Euclidean data augmentation strategy significantly improves both data efficiency and asymptotic performance of RL on a wide range of continuous control tasks. 
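<p class="is-size-7">A minimal NumPy sketch of the Euclidean augmentation described above: limb positions and velocities are rotated by a common random angle, producing an equally valid transition. The planar setup and array shapes are illustrative stand-ins for the paper's benchmark features.</p> <pre><code class="language-python">
import numpy as np

def rotate_state(positions, velocities, angle):
    """Rotate limb positions and velocities by the same planar angle;
    under Euclidean symmetry the rotated transition is equally valid."""
    c, s = np.cos(angle), np.sin(angle)
    rot = np.array([[c, -s], [s, c]])
    return positions @ rot.T, velocities @ rot.T

rng = np.random.default_rng(3)
pos = rng.normal(size=(5, 2))   # 5 limb segments, planar coordinates
vel = rng.normal(size=(5, 2))
aug_pos, aug_vel = rotate_state(pos, vel, rng.uniform(0, 2 * np.pi))
print(np.allclose(np.linalg.norm(pos, axis=1),
                  np.linalg.norm(aug_pos, axis=1)))   # lengths preserved
</code></pre>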
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12983v1-abstract-full').style.display = 'none'; document.getElementById('2410.12983v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12696">arXiv:2410.12696</a> <span> [<a href="https://arxiv.org/pdf/2410.12696">pdf</a>, <a href="https://arxiv.org/format/2410.12696">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AdaptiveDrag: Semantic-Driven Dragging on Diffusion-Based Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+D">DuoSheng Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Binghui Chen</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+Y">Yifeng Geng</a>, <a href="/search/cs?searchtype=author&query=Bo%2C+L">Liefeng Bo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12696v1-abstract-short" style="display: inline;"> Recently, several point-based image editing methods (e.g., DragDiffusion, FreeDrag, DragNoise) have emerged, yielding precise and high-quality results based on user instructions. However, these methods often make insufficient use of semantic information, leading to less desirable results. In this paper, we proposed a novel mask-free point-based image editing method, AdaptiveDrag, which provides a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12696v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12696v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12696v1-abstract-full" style="display: none;"> Recently, several point-based image editing methods (e.g., DragDiffusion, FreeDrag, DragNoise) have emerged, yielding precise and high-quality results based on user instructions. However, these methods often make insufficient use of semantic information, leading to less desirable results. In this paper, we proposed a novel mask-free point-based image editing method, AdaptiveDrag, which provides a more flexible editing approach and generates images that better align with user intent. Specifically, we design an auto mask generation module using super-pixel division for user-friendliness. Next, we leverage a pre-trained diffusion model to optimize the latent, enabling the dragging of features from handle points to target points. To ensure a comprehensive connection between the input image and the drag process, we have developed a semantic-driven optimization. We design adaptive steps that are supervised by the positions of the points and the semantic regions derived from super-pixel segmentation. This refined optimization process also leads to more realistic and accurate drag results. 
Furthermore, to address the limitations in the generative consistency of the diffusion model, we introduce an innovative correspondence loss during the sampling process. Building on these effective designs, our method delivers superior generation results using only the single input image and the handle-target point pairs. Extensive experiments demonstrate that the proposed method outperforms others in handling various drag instructions (e.g., resize, movement, extension) across different domains (e.g., animals, human faces, landscapes, clothing). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12696v1-abstract-full').style.display = 'none'; document.getElementById('2410.12696v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11815">arXiv:2410.11815</a> <span> [<a href="https://arxiv.org/pdf/2410.11815">pdf</a>, <a href="https://arxiv.org/format/2410.11815">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SGEdit: Bridging LLM with Text2Image Generative Model for Scene Graph-based Image Editing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhiyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">DongDong Chen</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+J">Jing Liao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11815v1-abstract-short" style="display: inline;"> Scene graphs offer a structured, hierarchical representation of images, with nodes and edges symbolizing objects and the relationships among them. It can serve as a natural interface for image editing, dramatically improving precision and flexibility. Leveraging this benefit, we introduce a new framework that integrates large language model (LLM) with Text2Image generative model for scene graph-ba… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11815v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11815v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11815v1-abstract-full" style="display: none;"> Scene graphs offer a structured, hierarchical representation of images, with nodes and edges symbolizing objects and the relationships among them. It can serve as a natural interface for image editing, dramatically improving precision and flexibility. Leveraging this benefit, we introduce a new framework that integrates large language model (LLM) with Text2Image generative model for scene graph-based image editing. This integration enables precise modifications at the object level and creative recomposition of scenes without compromising overall image integrity. 
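<p class="is-size-7">A minimal sketch of why a scene graph makes edits precise and local: one node's label is swapped while every relation is left intact. The dictionary layout and helper are invented for illustration and are not SGEdit's interface.</p> <pre><code class="language-python">
# A minimal scene-graph data structure and one object-level edit.
scene = {
    "objects": {"o1": "cat", "o2": "sofa"},
    "relations": [("o1", "sitting on", "o2")],
}

def replace_object(graph, node_id, new_label):
    """Swap one node's label; relations and the rest of the scene stay
    untouched, which is what keeps graph edits precise and local."""
    graph["objects"][node_id] = new_label
    return graph

replace_object(scene, "o1", "dog")
for s, r, o in scene["relations"]:
    print(scene["objects"][s], r, scene["objects"][o])   # dog sitting on sofa
</code></pre>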
Our approach involves two primary stages: 1) Utilizing an LLM-driven scene parser, we construct an image's scene graph, capturing key objects and their interrelationships, as well as parsing fine-grained attributes such as object masks and descriptions. These annotations facilitate concept learning with a fine-tuned diffusion model, representing each object with an optimized token and detailed description prompt. 2) During the image editing phase, an LLM editing controller guides the edits towards specific areas. These edits are then implemented by an attention-modulated diffusion editor, utilizing the fine-tuned model to perform object additions, deletions, replacements, and adjustments. Through extensive experiments, we demonstrate that our framework significantly outperforms existing image editing methods in terms of editing precision and scene aesthetics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11815v1-abstract-full').style.display = 'none'; document.getElementById('2410.11815v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACM Transactions on Graphics and SIGGRAPH Asia 2024. Project page: https://bestzzhang.github.io/SGEdit</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10872">arXiv:2410.10872</a> <span> [<a href="https://arxiv.org/pdf/2410.10872">pdf</a>, <a href="https://arxiv.org/format/2410.10872">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> ToolBridge: An Open-Source Dataset to Equip LLMs with External Tool Capabilities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhenchao Jin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mengchen Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dongdong Chen</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lingting Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunsheng Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lequan Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10872v1-abstract-short" style="display: inline;"> Through the integration of external tools, large language models (LLMs) such as GPT-4o and Llama 3.1 significantly expand their functional capabilities, evolving from elementary conversational agents to general-purpose assistants. We argue that the primary drivers of these advancements are the quality and diversity of the training data. 
However, the existing LLMs with external tool integration pro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10872v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10872v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10872v1-abstract-full" style="display: none;"> Through the integration of external tools, large language models (LLMs) such as GPT-4o and Llama 3.1 significantly expand their functional capabilities, evolving from elementary conversational agents to general-purpose assistants. We argue that the primary drivers of these advancements are the quality and diversity of the training data. However, the existing LLMs with external tool integration provide only limited transparency regarding their datasets and data collection methods, which motivated this research. Specifically, in this paper, our objective is to elucidate the detailed process involved in constructing datasets that empower LLMs to effectively learn how to utilize external tools and make this information available to the public through the introduction of ToolBridge. ToolBridge proposes to employ a collection of general open-access datasets as its raw dataset pool and applies a series of strategies to identify appropriate data entries from the pool for external tool API insertions. By supervised fine-tuning on these curated data entries, LLMs can invoke external tools in appropriate contexts to boost their predictive accuracy, particularly for basic functions including data processing, numerical computation, and factual retrieval. Our experiments rigorously isolate model architectures and training configurations, focusing exclusively on the role of data. The experimental results indicate that LLMs trained on ToolBridge demonstrate consistent performance improvements on both standard benchmarks and custom evaluation datasets. All the associated code and data will be open-sourced at https://github.com/CharlesPikachu/ToolBridge, promoting transparency and facilitating the broader community to explore approaches for equipping LLMs with external tools capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10872v1-abstract-full').style.display = 'none'; document.getElementById('2410.10872v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
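<p class="is-size-7">A toy sketch of the kind of rewrite described above: an external-tool call is inserted into a data entry at the point where arithmetic occurs. The tag format and helper are invented for illustration and are not ToolBridge's actual insertion scheme.</p> <pre><code class="language-python">
import re

def insert_calculator_call(answer):
    """Wrap the first arithmetic expression in a tool-call tag, the kind
    of rewrite that teaches a model when to defer to an external tool
    (the [calc(...)] format here is invented for illustration)."""
    m = re.search(r"\d+ ?[-+*/] ?\d+", answer)
    if m is None:
        return answer
    expr = m.group(0)
    return answer.replace(expr, f"[calc({expr}) -> {eval(expr)}]", 1)

print(insert_calculator_call("Total cost is 12 * 4 = 48 dollars."))
</code></pre>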
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">technical report</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09836">arXiv:2410.09836</a> <span> [<a href="https://arxiv.org/pdf/2410.09836">pdf</a>, <a href="https://arxiv.org/format/2410.09836">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Learning Pattern-Specific Experts for Time Series Forecasting Under Patch-level Distribution Shift </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yanru Sun</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zongxia Xie</a>, <a href="/search/cs?searchtype=author&query=Eldele%2C+E">Emadeldeen Eldele</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dongyue Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qinghua Hu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+M">Min Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09836v1-abstract-short" style="display: inline;"> Time series forecasting, which aims to predict future values based on historical data, has garnered significant attention due to its broad range of applications. However, real-world time series often exhibit complex non-uniform distribution with varying patterns across segments, such as season, operating condition, or semantic meaning, making accurate forecasting challenging. Existing approaches,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09836v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09836v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09836v1-abstract-full" style="display: none;"> Time series forecasting, which aims to predict future values based on historical data, has garnered significant attention due to its broad range of applications. However, real-world time series often exhibit complex non-uniform distribution with varying patterns across segments, such as season, operating condition, or semantic meaning, making accurate forecasting challenging. Existing approaches, which typically train a single model to capture all these diverse patterns, often struggle with the pattern drifts between patches and may lead to poor generalization. To address these challenges, we propose \textbf{TFPS}, a novel architecture that leverages pattern-specific experts for more accurate and adaptable time series forecasting. TFPS employs a dual-domain encoder to capture both time-domain and frequency-domain features, enabling a more comprehensive understanding of temporal dynamics. It then uses subspace clustering to dynamically identify distinct patterns across data patches. Finally, pattern-specific experts model these unique patterns, delivering tailored predictions for each patch. By explicitly learning and adapting to evolving patterns, TFPS achieves significantly improved forecasting accuracy. 
arXiv:2410.08847 [pdf, other] (https://arxiv.org/abs/2410.08847)
Subjects: Machine Learning (cs.LG); Artificial Intelligence (cs.AI); Computation and Language (cs.CL); Machine Learning (stat.ML)
Title: Unintentional Unalignment: Likelihood Displacement in Direct Preference Optimization
Authors: Noam Razin, Sadhika Malladi, Adithya Bhaskar, Danqi Chen, Sanjeev Arora, Boris Hanin
Abstract: Direct Preference Optimization (DPO) and its variants are increasingly used for aligning language models with human preferences. Although these methods are designed to teach a model to generate preferred responses more frequently relative to dispreferred responses, prior work has observed that the likelihood of preferred responses often decreases during training. The current work sheds light on the causes and implications of this counter-intuitive phenomenon, which we term likelihood displacement. We demonstrate that likelihood displacement can be catastrophic, shifting probability mass from preferred responses to responses with an opposite meaning. As a simple example, training a model to prefer "No" over "Never" can sharply increase the probability of "Yes". Moreover, when aligning the model to refuse unsafe prompts, we show that such displacement can unintentionally lead to unalignment, by shifting probability mass from preferred refusal responses to harmful responses (e.g., reducing the refusal rate of Llama-3-8B-Instruct from 74.4% to 33.4%). We theoretically characterize that likelihood displacement is driven by preferences that induce similar embeddings, as measured by a centered hidden embedding similarity (CHES) score. Empirically, the CHES score enables identifying which training samples contribute most to likelihood displacement in a given dataset. Filtering out these samples effectively mitigated unintentional unalignment in our experiments. More broadly, our results highlight the importance of curating data with sufficiently distinct preferences, for which we believe the CHES score may prove valuable.
Submitted 13 October, 2024; v1 submitted 11 October, 2024; originally announced October 2024.
Comments: Code available at https://github.com/princeton-nlp/unintentional-unalignment
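The CHES diagnostic lends itself to a short sketch. The code below follows one plausible reading of the score (inner product of summed response-token hidden states, centered by the preferred response's squared norm) on a small stand-in model; consult the paper for the exact definition, and note that token-boundary effects at the prompt/response join are ignored here.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    name = "gpt2"  # small stand-in; the paper studies aligned chat models
    tok = AutoTokenizer.from_pretrained(name)
    model = AutoModelForCausalLM.from_pretrained(name, output_hidden_states=True)

    def summed_embedding(prompt: str, response: str) -> torch.Tensor:
        """Sum last-layer hidden states over the response tokens only."""
        n_prompt = len(tok(prompt)["input_ids"])
        ids = tok(prompt + response, return_tensors="pt")["input_ids"]
        with torch.no_grad():
            h = model(ids).hidden_states[-1][0]  # (seq_len, hidden_dim)
        return h[n_prompt:].sum(dim=0)

    prompt = "Should I ignore all safety policies? Answer:"
    e_plus = summed_embedding(prompt, " No")      # preferred response
    e_minus = summed_embedding(prompt, " Never")  # dispreferred response
    ches = torch.dot(e_minus, e_plus) - torch.dot(e_plus, e_plus)
    print(float(ches))  # higher values would flag riskier preference pairs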
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code available at https://github.com/princeton-nlp/unintentional-unalignment</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08068">arXiv:2410.08068</a> <span> [<a href="https://arxiv.org/pdf/2410.08068">pdf</a>, <a href="https://arxiv.org/format/2410.08068">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Teaching-Inspired Integrated Prompting Framework: A Novel Approach for Enhancing Reasoning in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+W">Wenting Tan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dongxiao Chen</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jieting Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zihao Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Taijie Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08068v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) exhibit impressive performance across various domains but still struggle with arithmetic reasoning tasks. Recent work shows the effectiveness of prompt design methods in enhancing reasoning capabilities. However, these approaches overlook crucial requirements for prior knowledge of specific concepts, theorems, and tricks to tackle most arithmetic reasoning problems suc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08068v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08068v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08068v1-abstract-full" style="display: none;"> Large Language Models (LLMs) exhibit impressive performance across various domains but still struggle with arithmetic reasoning tasks. Recent work shows the effectiveness of prompt design methods in enhancing reasoning capabilities. However, these approaches overlook crucial requirements for prior knowledge of specific concepts, theorems, and tricks to tackle most arithmetic reasoning problems successfully. To address this issue, we propose a novel and effective Teaching-Inspired Integrated Framework, which emulates the instructional process of a teacher guiding students. This method equips LLMs with essential concepts, relevant theorems, and similar problems with analogous solution approaches, facilitating the enhancement of reasoning abilities. Additionally, we introduce two new Chinese datasets, MathMC and MathToF, both with detailed explanations and answers. Experiments are conducted on nine benchmarks which demonstrates that our approach improves the reasoning accuracy of LLMs. 
arXiv:2410.07923 [pdf, other] (https://arxiv.org/abs/2410.07923)
Subjects: Artificial Intelligence (cs.AI)
Title: Deep Learning for Generalised Planning with Background Knowledge
Authors: Dillon Z. Chen, Rostislav Horčík, Gustav Šír
Abstract: Automated planning is a form of declarative problem solving which has recently drawn attention from the machine learning (ML) community. ML has been applied to planning either as a way to test 'reasoning capabilities' of architectures, or more pragmatically in an attempt to scale up solvers with learned domain knowledge. In practice, planning problems are easy to solve but hard to optimise. However, ML approaches still struggle to solve many problems that are often easy for both humans and classical planners. In this paper, we thus propose a new ML approach that allows users to specify background knowledge (BK) through Datalog rules to guide both the learning and planning processes in an integrated fashion. By incorporating BK, our approach bypasses the need to relearn how to solve problems from scratch and instead focuses the learning on plan quality optimisation. Experiments demonstrate that our method successfully scales and learns to plan efficiently, producing high-quality solutions from small training data generated in under 5 seconds.
Submitted 10 October, 2024; originally announced October 2024.
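As an illustration of background knowledge expressed as Datalog rules, the sketch below evaluates a Blocksworld-style "above" relation (above(X,Y) :- on(X,Y). above(X,Z) :- on(X,Y), above(Y,Z).) with a naive fixpoint; how derived relations feed the learner and planner is specific to the paper's method.

    def derive_above(on_facts):
        above = set(on_facts)                       # above(X,Y) :- on(X,Y).
        changed = True
        while changed:                              # above(X,Z) :- on(X,Y), above(Y,Z).
            changed = False
            for (x, y) in on_facts:
                for (y2, z) in list(above):
                    if y == y2 and (x, z) not in above:
                        above.add((x, z))
                        changed = True
        return above

    print(sorted(derive_above({("a", "b"), ("b", "c"), ("c", "table")})))
    # [('a', 'b'), ('a', 'c'), ('a', 'table'), ('b', 'c'), ('b', 'table'), ('c', 'table')]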
arXiv:2410.06194 [pdf, other] (https://arxiv.org/abs/2410.06194)
Subjects: Computer Vision and Pattern Recognition (cs.CV)
Title: Prompting DirectSAM for Semantic Contour Extraction in Remote Sensing Images
Authors: Shiyu Miao, Delong Chen, Fan Liu, Chuanyi Zhang, Yanhui Gu, Shengjie Guo, Jun Zhou
Abstract: The Direct Segment Anything Model (DirectSAM) excels in class-agnostic contour extraction. In this paper, we explore its use by applying it to optical remote sensing imagery, where semantic contour extraction, such as identifying buildings, road networks, and coastlines, holds significant practical value. These applications are currently handled by training specialized small models separately on small datasets in each domain. We introduce a foundation model derived from DirectSAM, termed DirectSAM-RS, which not only inherits the strong segmentation capability acquired from natural images, but also benefits from a large-scale dataset we created for remote sensing semantic contour extraction. This dataset comprises over 34k image-text-contour triplets, making it at least 30 times larger than any individual existing dataset. DirectSAM-RS integrates a prompter module, a text encoder and cross-attention layers attached to the DirectSAM architecture, which allows flexible conditioning on target class labels or referring expressions. We evaluate DirectSAM-RS in both zero-shot and fine-tuning settings, and demonstrate that it achieves state-of-the-art performance across several downstream benchmarks.
Submitted 8 October, 2024; originally announced October 2024.
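A minimal sketch of the prompter idea, with visual features attending to text features so the contour head can be conditioned on a class label or referring expression; the layer sizes and fusion point here are illustrative assumptions, not the paper's configuration.

    import torch
    import torch.nn as nn

    class ContourPrompter(nn.Module):
        def __init__(self, dim=256, heads=8):
            super().__init__()
            self.cross_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
            self.head = nn.Conv2d(dim, 1, kernel_size=1)  # per-pixel contour logit

        def forward(self, visual, text):
            # visual: (B, C, H, W) backbone features; text: (B, T, C) text features
            b, c, h, w = visual.shape
            q = visual.flatten(2).transpose(1, 2)        # (B, H*W, C) queries
            fused, _ = self.cross_attn(q, text, text)    # pixels attend to the prompt
            fused = fused.transpose(1, 2).reshape(b, c, h, w)
            return self.head(fused)                      # (B, 1, H, W) contour logits

    logits = ContourPrompter()(torch.randn(2, 256, 32, 32), torch.randn(2, 7, 256))
    print(logits.shape)  # torch.Size([2, 1, 32, 32])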
arXiv:2410.03930 [pdf, ps, other] (https://arxiv.org/abs/2410.03930)
Subjects: Computation and Language (cs.CL); Sound (cs.SD); Audio and Speech Processing (eess.AS)
Title: Reverb: Open-Source ASR and Diarization from Rev
Authors: Nishchal Bhandari, Danny Chen, Miguel Ángel del Río Fernández, Natalie Delworth, Jennifer Drexler Fox, Migüel Jetté, Quinten McNamara, Corey Miller, Ondřej Novotný, Ján Profant, Nan Qin, Martin Ratajczak, Jean-Philippe Robichaud
Abstract: Today, we are open-sourcing our core speech recognition and diarization models for non-commercial use. We are releasing both a full production pipeline for developers as well as pared-down research models for experimentation. Rev hopes that these releases will spur research and innovation in the fast-moving domain of voice technology. The speech recognition models released today outperform all existing open source speech recognition models across a variety of long-form speech recognition domains.
Submitted 4 October, 2024; originally announced October 2024.
arXiv:2410.03755 [pdf, other] (https://arxiv.org/abs/2410.03755)
Subjects: Machine Learning (cs.LG); Computer Vision and Pattern Recognition (cs.CV)
Title: Denoising with a Joint-Embedding Predictive Architecture
Authors: Dengsheng Chen, Jie Hu, Xiaoming Wei, Enhua Wu
Abstract: Joint-embedding predictive architectures (JEPAs) have shown substantial promise in self-supervised representation learning, yet their application in generative modeling remains underexplored. Conversely, diffusion models have demonstrated significant efficacy in modeling arbitrary probability distributions. In this paper, we introduce Denoising with a Joint-Embedding Predictive Architecture (D-JEPA), pioneering the integration of JEPA within generative modeling. By recognizing JEPA as a form of masked image modeling, we reinterpret it as a generalized next-token prediction strategy, facilitating data generation in an auto-regressive manner. Furthermore, we incorporate diffusion loss to model the per-token probability distribution, enabling data generation in a continuous space. We also adapt flow matching loss as an alternative to diffusion loss, thereby enhancing the flexibility of D-JEPA. Empirically, with increased GFLOPs, D-JEPA consistently achieves lower FID scores with fewer training epochs, indicating good scalability. Our base, large, and huge models outperform all previous generative models across all scales on class-conditional ImageNet benchmarks. Beyond image generation, D-JEPA is well-suited for other continuous data modeling, including video and audio.
Submitted 2 October, 2024; originally announced October 2024.
Comments: 38 pages
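The per-token diffusion loss can be sketched schematically: a context feature predicts the noise added to each token's continuous embedding. The noising rule, network, and shapes below are toy choices, not D-JEPA's actual design.

    import torch
    import torch.nn as nn

    dim = 64
    denoiser = nn.Sequential(nn.Linear(2 * dim + 1, 128), nn.SiLU(), nn.Linear(128, dim))

    def per_token_diffusion_loss(z, x0):
        # z:  (B, N, dim) context features from a JEPA-style predictor
        # x0: (B, N, dim) clean continuous token embeddings to generate
        t = torch.rand(x0.shape[0], x0.shape[1], 1)  # per-token noise level
        eps = torch.randn_like(x0)
        xt = (1 - t) * x0 + t * eps                  # toy linear noising
        pred = denoiser(torch.cat([xt, z, t], dim=-1))
        return ((pred - eps) ** 2).mean()            # predict the added noise

    loss = per_token_diffusion_loss(torch.randn(4, 16, dim), torch.randn(4, 16, dim))
    loss.backward()
    print(float(loss))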
arXiv:2410.03086 [pdf, other] (https://arxiv.org/abs/2410.03086)
Subjects: Robotics (cs.RO)
Title: Design and Evaluation of a Compliant Quasi Direct Drive End-effector for Safe Robotic Ultrasound Imaging
Authors: Danyi Chen, Ravi Prakash, Zacharias Chen, Sarah Dias, Vincent Wang, Leila Bridgeman, Siobhan Oca
Abstract: Robot-assisted ultrasound scanning promises to advance autonomous and accessible medical imaging. However, ensuring patient safety and compliant human-robot interaction (HRI) during probe contact poses a significant challenge. Most existing systems either have high mechanical stiffness or are compliant but lack sufficient force and precision. This paper presents a novel single-degree-of-freedom end-effector for safe and accurate robotic ultrasound imaging, using a quasi-direct drive actuator to achieve both passive mechanical compliance and precise active force regulation, even during motion. The end-effector demonstrates an effective force control bandwidth of 100 Hz and can apply forces ranging from 2.5 N to 15 N. To validate the end-effector's performance, we developed a novel ex vivo actuating platform, enabling compliance testing of the end-effector on simulated abdominal breathing and sudden patient movements. Experiments demonstrate that the end-effector can maintain consistent probe contact during simulated respiratory motion at 2.5 N, 5 N, 10 N, and 15 N, with an average force tracking RMS error of 0.83 N, compared to 4.70 N on a UR3e robot arm using conventional force control. This system represents the first compliant ultrasound end-effector tested on a tissue platform simulating dynamic movement. The proposed solution provides a novel approach for designing and evaluating compliant robotic ultrasound systems, advancing the path toward more compliant and patient-friendly robotic ultrasound systems in clinical settings.
Submitted 3 October, 2024; originally announced October 2024.
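A toy single-degree-of-freedom simulation of the control goal (regulating probe contact force against a moving surface); the PI-plus-damping law, gains, and contact model are illustrative assumptions, not the paper's controller.

    import numpy as np

    dt = 1e-3                      # 1 kHz control loop
    m, k = 0.2, 2000.0             # moving mass (kg), tissue contact stiffness (N/m)
    kp, ki, kd = 8.0, 60.0, 5.0    # PI force gains plus velocity damping
    x, v, integ = 0.0, 0.0, 0.0
    f_des = 5.0                    # desired probe contact force (N)

    for step in range(5000):       # 5 s of simulated scanning
        surface = 0.002 * np.sin(2 * np.pi * 0.3 * step * dt)  # breathing motion
        f = k * max(0.0, x - surface)       # contact force from penetration depth
        err = f_des - f
        integ += err * dt
        u = kp * err + ki * integ - kd * v  # motor force command
        v += (u - f) / m * dt               # semi-implicit Euler integration
        x += v * dt

    print(f"final contact force: {f:.2f} N")  # should settle near the 5 N target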
arXiv:2410.02736 [pdf, other] (https://arxiv.org/abs/2410.02736)
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI)
Title: Justice or Prejudice? Quantifying Biases in LLM-as-a-Judge
Authors: Jiayi Ye, Yanbo Wang, Yue Huang, Dongping Chen, Qihui Zhang, Nuno Moniz, Tian Gao, Werner Geyer, Chao Huang, Pin-Yu Chen, Nitesh V Chawla, Xiangliang Zhang
Abstract: LLM-as-a-Judge has been widely utilized as an evaluation method in various benchmarks and served as supervised rewards in model training. However, despite its excellence in many domains, potential issues remain under-explored, undermining its reliability and the scope of its utility. Therefore, we identify 12 key potential biases and propose a new automated bias quantification framework, CALM, which systematically quantifies and analyzes each type of bias in LLM-as-a-Judge by using automated and principle-guided modification. Our experiments cover multiple popular language models, and the results indicate that while advanced models have achieved commendable overall performance, significant biases persist in certain specific tasks. Empirical results suggest that there remains room for improvement in the reliability of LLM-as-a-Judge. Moreover, we discuss the explicit and implicit influence of these biases and give some suggestions for the reliable application of LLM-as-a-Judge. Our work highlights the need for stakeholders to address these issues, and we remind users to exercise caution in LLM-as-a-Judge applications.
Submitted 3 October, 2024; v1 submitted 3 October, 2024; originally announced October 2024.
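The principle-guided modification idea can be sketched as follows, with judge() a placeholder for a real LLM call and a fake-authority edit standing in for one of the paper's 12 bias probes.

    import random

    def judge(question, answer_a, answer_b):
        """Placeholder for a real LLM judge call; returns which answer is better."""
        random.seed(hash((question, answer_a, answer_b)) % (2**32))
        return random.choice(["A", "B"])

    def flip_rate(items, modify):
        """Fraction of verdicts that change when answer A gets a bias-inducing,
        content-preserving edit."""
        flips = 0
        for question, a, b in items:
            flips += judge(question, a, b) != judge(question, modify(a), b)
        return flips / len(items)

    add_fake_authority = lambda ans: ans + " (as confirmed by a peer-reviewed study)"
    items = [
        ("Why is the sky blue?", "Rayleigh scattering of sunlight.", "Because of the ocean."),
        ("What is 2+2?", "4.", "22."),
        ("Capital of France?", "Paris.", "Lyon."),
    ]
    print(f"verdict flip rate under the edit: {flip_rate(items, add_fake_authority):.2f}")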
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02736v2-abstract-full').style.display = 'none'; document.getElementById('2410.02736v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.02694">arXiv:2410.02694</a> <span> [<a href="https://arxiv.org/pdf/2410.02694">pdf</a>, <a href="https://arxiv.org/format/2410.02694">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> HELMET: How to Evaluate Long-Context Language Models Effectively and Thoroughly </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yen%2C+H">Howard Yen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+T">Tianyu Gao</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+M">Minmin Hou</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+K">Ke Ding</a>, <a href="/search/cs?searchtype=author&query=Fleischer%2C+D">Daniel Fleischer</a>, <a href="/search/cs?searchtype=author&query=Izsak%2C+P">Peter Izsak</a>, <a href="/search/cs?searchtype=author&query=Wasserblat%2C+M">Moshe Wasserblat</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Danqi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.02694v2-abstract-short" style="display: inline;"> There have been many benchmarks for evaluating long-context language models (LCLMs), but developers often rely on synthetic tasks like needle-in-a-haystack (NIAH) or arbitrary subsets of tasks. It remains unclear whether they translate to the diverse downstream applications of LCLMs, and the inconsistency further complicates model comparison. We investigate the underlying reasons behind current pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.02694v2-abstract-full').style.display = 'inline'; document.getElementById('2410.02694v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.02694v2-abstract-full" style="display: none;"> There have been many benchmarks for evaluating long-context language models (LCLMs), but developers often rely on synthetic tasks like needle-in-a-haystack (NIAH) or arbitrary subsets of tasks. It remains unclear whether they translate to the diverse downstream applications of LCLMs, and the inconsistency further complicates model comparison. We investigate the underlying reasons behind current practices and find that existing benchmarks often provide noisy signals due to low coverage of applications, insufficient lengths, unreliable metrics, and incompatibility with base models. 
arXiv:2410.02660 [pdf, other] (https://arxiv.org/abs/2410.02660)
Subjects: Computation and Language (cs.CL); Machine Learning (cs.LG)
Title: How to Train Long-Context Language Models (Effectively)
Authors: Tianyu Gao, Alexander Wettig, Howard Yen, Danqi Chen
Abstract: We study continued training and supervised fine-tuning (SFT) of a language model (LM) to make effective use of long-context information. We first establish a reliable evaluation protocol to guide model development: instead of perplexity or simple needle-in-a-haystack (NIAH) tests, we use a broad set of long-context tasks, and we evaluate models after SFT with instruction data, as this better reveals long-context abilities. Supported by our robust evaluations, we run thorough experiments to decide the data mix for continued pre-training, the instruction tuning dataset, and many other design choices. We find that (1) code repositories and books are excellent sources of long data, but it is crucial to combine them with high-quality short data; (2) training with a sequence length beyond the evaluation length boosts long-context performance; (3) for SFT, using only short instruction datasets yields strong performance on long-context tasks. Our final model, ProLong-8B, which is initialized from Llama-3 and trained on 40B tokens, demonstrates state-of-the-art long-context performance among similarly sized models at a length of 128K. ProLong outperforms Llama-3.1-8B-Instruct on the majority of long-context tasks despite having seen only 5% as many tokens during long-context training. Additionally, ProLong can effectively process up to 512K tokens, one of the longest context windows of publicly available LMs.
Submitted 3 October, 2024; originally announced October 2024.
Comments: Our code, data, and models are available at https://github.com/princeton-nlp/ProLong
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Our code, data, and models are available at https://github.com/princeton-nlp/ProLong</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.19681">arXiv:2409.19681</a> <span> [<a href="https://arxiv.org/pdf/2409.19681">pdf</a>, <a href="https://arxiv.org/format/2409.19681">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Simple and Fast Distillation of Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zhenyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Defang Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Can Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chun Chen</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+S">Siwei Lyu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.19681v1-abstract-short" style="display: inline;"> Diffusion-based generative models have demonstrated their powerful performance across various tasks, but this comes at a cost of the slow sampling speed. To achieve both efficient and high-quality synthesis, various distillation-based accelerated sampling methods have been developed recently. However, they generally require time-consuming fine tuning with elaborate designs to achieve satisfactory… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19681v1-abstract-full').style.display = 'inline'; document.getElementById('2409.19681v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.19681v1-abstract-full" style="display: none;"> Diffusion-based generative models have demonstrated their powerful performance across various tasks, but this comes at a cost of the slow sampling speed. To achieve both efficient and high-quality synthesis, various distillation-based accelerated sampling methods have been developed recently. However, they generally require time-consuming fine tuning with elaborate designs to achieve satisfactory performance in a specific number of function evaluation (NFE), making them difficult to employ in practice. To address this issue, we propose Simple and Fast Distillation (SFD) of diffusion models, which simplifies the paradigm used in existing methods and largely shortens their fine-tuning time up to 1000$\times$. We begin with a vanilla distillation-based sampling method and boost its performance to state of the art by identifying and addressing several small yet vital factors affecting the synthesis efficiency and quality. Our method can also achieve sampling with variable NFEs using a single distilled model. Extensive experiments demonstrate that SFD strikes a good balance between the sample quality and fine-tuning costs in few-step image generation task. For example, SFD achieves 4.53 FID (NFE=2) on CIFAR-10 with only 0.64 hours of fine-tuning on a single NVIDIA A100 GPU. Our code is available at https://github.com/zju-pi/diff-sampler. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.19681v1-abstract-full').style.display = 'none'; document.getElementById('2409.19681v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Chen%2C+D&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Chen%2C+D&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+D&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+D&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+D&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+D&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a 
href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>