Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,059 results for author: <span class="mathjax">Zhao, S</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhao%2C+S">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhao, S"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhao%2C+S&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhao, S"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09597">arXiv:2502.09597</a> <span> [<a href="https://arxiv.org/pdf/2502.09597">pdf</a>, <a href="https://arxiv.org/format/2502.09597">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Do LLMs Recognize Your Preferences? Evaluating Personalized Preference Following in LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Siyan Zhao</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+M">Mingyi Hong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Hazarika%2C+D">Devamanyu Hazarika</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+K">Kaixiang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09597v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) are increasingly used as chatbots, yet their ability to personalize responses to user preferences remains limited. We introduce PrefEval, a benchmark for evaluating LLMs' ability to infer, memorize and adhere to user preferences in a long-context conversational setting. PrefEval comprises 3,000 manually curated user preference and query pairs spanning 20 topics. 
PrefEv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09597v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09597v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09597v1-abstract-full" style="display: none;"> Large Language Models (LLMs) are increasingly used as chatbots, yet their ability to personalize responses to user preferences remains limited. We introduce PrefEval, a benchmark for evaluating LLMs' ability to infer, memorize and adhere to user preferences in a long-context conversational setting. PrefEval comprises 3,000 manually curated user preference and query pairs spanning 20 topics. PrefEval contains user personalization or preference information in both explicit and implicit forms, and evaluates LLM performance using a generation and a classification task. With PrefEval, we evaluated the aforementioned preference following capabilities of 10 open-source and proprietary LLMs in multi-session conversations with varying context lengths up to 100k tokens. We benchmark with various prompting, iterative feedback, and retrieval-augmented generation methods. Our benchmarking effort reveals that state-of-the-art LLMs face significant challenges in proactively following users' preferences during conversations. In particular, in zero-shot settings, preference following accuracy falls below 10% at merely 10 turns (~3k tokens) across most evaluated models. Even with advanced prompting and retrieval methods, preference following still deteriorates in long-context conversations. Furthermore, we show that fine-tuning on PrefEval significantly improves performance. We believe PrefEval serves as a valuable resource for measuring, understanding, and enhancing LLMs' preference following abilities, paving the way for personalized conversational agents. Our code and dataset are available at https://prefeval.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09597v1-abstract-full').style.display = 'none'; document.getElementById('2502.09597v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at ICLR 2025 as oral presentation. 
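The PrefEval entry above evaluates whether a model still honors a preference stated many turns earlier in a long conversation. The toy loop below is only an illustration of that setup, not the authors' benchmark code: the conversation format, the chat_model stub, and the keyword-based adherence check are all invented for this sketch.

```python
# Toy sketch of a preference-following check: a preference is stated early,
# distractor turns are appended, and the final answer is tested against the
# preference. The conversation format, stub model, and keyword check are
# illustrative assumptions, not the PrefEval implementation.

def chat_model(messages):
    """Placeholder for an LLM call; returns a canned reply for the demo."""
    return "Sure! I recommend a lively steakhouse downtown."

def adheres(response, forbidden_terms):
    """Crude adherence check: the reply must avoid terms the user ruled out."""
    return not any(term in response.lower() for term in forbidden_terms)

example = {
    "preference": "I'm vegetarian, so please never suggest meat-heavy restaurants.",
    "forbidden_terms": ["steak", "bbq", "butcher"],
    "query": "Can you recommend a restaurant for dinner tonight?",
}

messages = [{"role": "user", "content": example["preference"]},
            {"role": "assistant", "content": "Got it, I'll keep that in mind."}]
# Distractor turns push the stated preference deeper into the context window.
for i in range(10):
    messages.append({"role": "user", "content": f"Unrelated question #{i} about the weather."})
    messages.append({"role": "assistant", "content": "Here is some weather small talk."})
messages.append({"role": "user", "content": example["query"]})

reply = chat_model(messages)
print("adheres to preference:", adheres(reply, example["forbidden_terms"]))
```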
2. arXiv:2502.08352 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Sat-DN: Implicit Surface Reconstruction from Multi-View Satellite Images with Depth and Normal Supervision
Authors: Tianle Liu, Shuangming Zhao, Wanshou Jiang, Bingxuan Guo
Abstract: With advancements in satellite imaging technology, acquiring high-resolution multi-view satellite imagery has become increasingly accessible, enabling rapid and location-independent ground model reconstruction. However, traditional stereo matching methods struggle to capture fine details, and while neural radiance fields (NeRFs) achieve high-quality reconstructions, their training time is prohibitively long. Moreover, challenges such as low visibility of building facades, illumination and style differences between pixels, and weakly textured regions in satellite imagery further make it hard to reconstruct reasonable terrain geometry and detailed building facades. To address these issues, we propose Sat-DN, a novel framework leveraging a progressively trained multi-resolution hash grid reconstruction architecture with explicit depth guidance and surface normal consistency constraints to enhance reconstruction quality. The multi-resolution hash grid accelerates training, while the progressive strategy incrementally increases the learning frequency, using coarse low-frequency geometry to guide the reconstruction of fine high-frequency details. The depth and normal constraints ensure a clear building outline and correct planar distribution. Extensive experiments on the DFC2019 dataset demonstrate that Sat-DN outperforms existing methods, achieving state-of-the-art results in both qualitative and quantitative evaluations. The code is available at https://github.com/costune/SatDN.
Submitted 12 February, 2025; originally announced February 2025.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08352v1-abstract-full').style.display = 'none'; document.getElementById('2502.08352v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08089">arXiv:2502.08089</a> <span> [<a href="https://arxiv.org/pdf/2502.08089">pdf</a>, <a href="https://arxiv.org/format/2502.08089">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> A Cooperative Bearing-Rate Approach for Observability-Enhanced Target Motion Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Canlun Zheng</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hanqing Guo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shiyu Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08089v1-abstract-short" style="display: inline;"> Vision-based target motion estimation is a fundamental problem in many robotic tasks. The existing methods have the limitation of low observability and, hence, face challenges in tracking highly maneuverable targets. Motivated by the aerial target pursuit task where a target may maneuver in 3D space, this paper studies how to further enhance observability by incorporating the \emph{bearing rate} i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08089v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08089v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08089v1-abstract-full" style="display: none;"> Vision-based target motion estimation is a fundamental problem in many robotic tasks. The existing methods have the limitation of low observability and, hence, face challenges in tracking highly maneuverable targets. Motivated by the aerial target pursuit task where a target may maneuver in 3D space, this paper studies how to further enhance observability by incorporating the \emph{bearing rate} information that has not been well explored in the literature. The main contribution of this paper is to propose a new cooperative estimator called STT-R (Spatial-Temporal Triangulation with bearing Rate), which is designed under the framework of distributed recursive least squares. This theoretical result is further verified by numerical simulation and real-world experiments. It is shown that the proposed STT-R algorithm can effectively generate more accurate estimations and effectively reduce the lag in velocity estimation, enabling tracking of more maneuverable targets. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08089v1-abstract-full').style.display = 'none'; document.getElementById('2502.08089v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by icra 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07381">arXiv:2502.07381</a> <span> [<a href="https://arxiv.org/pdf/2502.07381">pdf</a>, <a href="https://arxiv.org/format/2502.07381">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Spatial Degradation-Aware and Temporal Consistent Diffusion Model for Compressed Video Super-Resolution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=An%2C+H">Hongyu An</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shijie Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Li Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07381v2-abstract-short" style="display: inline;"> Due to limitations of storage and bandwidth, videos stored and transmitted on the Internet are usually low-quality with low-resolution and compression noise. Although video super-resolution (VSR) is an efficient technique to enhance video resolution, relatively VSR methods focus on compressed videos. Directly applying general VSR approaches leads to the failure of improving practical videos, espec… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07381v2-abstract-full').style.display = 'inline'; document.getElementById('2502.07381v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07381v2-abstract-full" style="display: none;"> Due to limitations of storage and bandwidth, videos stored and transmitted on the Internet are usually low-quality with low-resolution and compression noise. Although video super-resolution (VSR) is an efficient technique to enhance video resolution, relatively VSR methods focus on compressed videos. Directly applying general VSR approaches leads to the failure of improving practical videos, especially when frames are highly compressed at a low bit rate. Recently, diffusion models have achieved superior performance in low-level visual tasks, and their high-realism generation capability enables them to be applied in VSR. To synthesize more compression-lost details and refine temporal consistency, we propose a novel Spatial Degradation-Aware and Temporal Consistent (SDATC) diffusion model for compressed VSR. Specifically, we introduce a distortion Control module (DCM) to modulate diffusion model inputs and guide the generation. 
Next, the diffusion model executes the denoising process for texture generation with fine-tuned spatial prompt-based compression-aware module (PCAM) and spatio-temporal attention module (STAM). PCAM extracts features to encode specific compression information dynamically. STAM extends the spatial attention mechanism to a spatio-temporal dimension for capturing temporal correlation. Extensive experimental results on benchmark datasets demonstrate the effectiveness of the proposed modules in enhancing compressed videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07381v2-abstract-full').style.display = 'none'; document.getElementById('2502.07381v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07331">arXiv:2502.07331</a> <span> [<a href="https://arxiv.org/pdf/2502.07331">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> ERANet: Edge Replacement Augmentation for Semi-Supervised Meniscus Segmentation with Prototype Consistency Alignment and Conditional Self-Training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Siyue Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yongcheng Yao</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+J">Junru Zhong</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shutian Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yudong Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuihua Wang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+J">Jin Hong</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weitian Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07331v1-abstract-short" style="display: inline;"> Manual segmentation is labor-intensive, and automatic segmentation remains challenging due to the inherent variability in meniscal morphology, partial volume effects, and low contrast between the meniscus and surrounding tissues. To address these challenges, we propose ERANet, an innovative semi-supervised framework for meniscus segmentation that effectively leverages both labeled and unlabeled im… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07331v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07331v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07331v1-abstract-full" style="display: none;"> Manual segmentation is labor-intensive, and automatic segmentation remains challenging due to the inherent variability in meniscal morphology, partial volume effects, and low contrast between the meniscus and surrounding tissues. 
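The STAM component above is described as extending spatial attention to a spatio-temporal dimension. A minimal sketch of that generic idea, with arbitrary shapes and none of the SDATC specifics (prompting, compression awareness, or the diffusion backbone), might look like this:

```python
# Minimal sketch of extending spatial self-attention to spatio-temporal
# self-attention, in the spirit of the STAM idea described above: tokens
# span frames x height x width instead of a single frame. Shapes and
# hyperparameters are arbitrary; this is not the SDATC implementation.
import torch
import torch.nn as nn

class SpatioTemporalAttention(nn.Module):
    def __init__(self, channels: int, num_heads: int = 4):
        super().__init__()
        self.norm = nn.LayerNorm(channels)
        self.attn = nn.MultiheadAttention(channels, num_heads, batch_first=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, frames, channels, height, width)
        b, t, c, h, w = x.shape
        tokens = x.permute(0, 1, 3, 4, 2).reshape(b, t * h * w, c)
        q = self.norm(tokens)
        y, _ = self.attn(q, q, q)                 # attention over all frames and pixels
        tokens = tokens + y                       # residual connection
        return tokens.reshape(b, t, h, w, c).permute(0, 1, 4, 2, 3)

feats = torch.randn(2, 4, 64, 8, 8)               # toy video feature map
out = SpatioTemporalAttention(64)(feats)
print(out.shape)                                  # torch.Size([2, 4, 64, 8, 8])
```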
5. arXiv:2502.07331 [pdf]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: ERANet: Edge Replacement Augmentation for Semi-Supervised Meniscus Segmentation with Prototype Consistency Alignment and Conditional Self-Training
Authors: Siyue Li, Yongcheng Yao, Junru Zhong, Shutian Zhao, Yudong Zhang, Shuihua Wang, Jin Hong, Weitian Chen
Abstract: Manual segmentation is labor-intensive, and automatic segmentation remains challenging due to the inherent variability in meniscal morphology, partial volume effects, and low contrast between the meniscus and surrounding tissues. To address these challenges, we propose ERANet, an innovative semi-supervised framework for meniscus segmentation that effectively leverages both labeled and unlabeled images through advanced augmentation and learning strategies. ERANet integrates three key components: edge replacement augmentation (ERA), prototype consistency alignment (PCA), and a conditional self-training (CST) strategy within a mean teacher architecture. ERA introduces anatomically relevant perturbations by simulating meniscal variations, ensuring that augmentations align with the structural context. PCA enhances segmentation performance by aligning intra-class features and promoting compact, discriminative feature representations, particularly in scenarios with limited labeled data. CST improves segmentation robustness by iteratively refining pseudo-labels and mitigating the impact of label noise during training. Together, these innovations establish ERANet as a robust and scalable solution for meniscus segmentation, effectively addressing key barriers to practical implementation. We validated ERANet comprehensively on 3D Double Echo Steady State (DESS) and 3D Fast/Turbo Spin Echo (FSE/TSE) MRI sequences. The results demonstrate the superior performance of ERANet compared to state-of-the-art methods. The proposed framework achieves reliable and accurate segmentation of meniscus structures, even when trained on minimal labeled data. Extensive ablation studies further highlight the synergistic contributions of ERA, PCA, and CST, solidifying ERANet as a transformative solution for semi-supervised meniscus segmentation in medical imaging.
Submitted 11 February, 2025; originally announced February 2025.
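Two generic ingredients named in the ERANet entry, a mean-teacher (EMA) model and confidence-filtered pseudo-labels for unlabeled images, can be sketched as follows. The tiny stand-in network, the 0.8 threshold, and the momentum value are placeholders, not the paper's architecture or training schedule.

```python
# Rough sketch of a mean teacher with confidence-thresholded pseudo-labels
# for unlabeled images; illustrative only, not ERANet's configuration.
import torch
import torch.nn as nn
import torch.nn.functional as F

student = nn.Conv2d(1, 2, kernel_size=3, padding=1)   # stand-in segmentation net
teacher = nn.Conv2d(1, 2, kernel_size=3, padding=1)
teacher.load_state_dict(student.state_dict())
for p in teacher.parameters():
    p.requires_grad_(False)

def ema_update(student, teacher, momentum=0.99):
    """Teacher weights track an exponential moving average of the student."""
    with torch.no_grad():
        for ps, pt in zip(student.parameters(), teacher.parameters()):
            pt.mul_(momentum).add_(ps, alpha=1.0 - momentum)

unlabeled = torch.randn(4, 1, 32, 32)
with torch.no_grad():
    probs = F.softmax(teacher(unlabeled), dim=1)       # teacher predictions
    conf, pseudo = probs.max(dim=1)                    # per-pixel confidence / label
mask = conf > 0.8                                      # keep only confident pixels

logits = student(unlabeled)
loss = F.cross_entropy(logits, pseudo, reduction="none")
loss = (loss * mask).sum() / mask.sum().clamp(min=1)   # masked pseudo-label loss
loss.backward()
ema_update(student, teacher)
print(float(loss))
```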
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07288">arXiv:2502.07288</a> <span> [<a href="https://arxiv.org/pdf/2502.07288">pdf</a>, <a href="https://arxiv.org/format/2502.07288">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> KPIs 2024 Challenge: Advancing Glomerular Segmentation from Patch- to Slide-Level </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+R">Ruining Deng</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+T">Tianyuan Yao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yucheng Tang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Junlin Guo</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Siqi Lu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+J">Juming Xiong</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Lining Yu</a>, <a href="/search/cs?searchtype=author&query=Cap%2C+Q+H">Quan Huu Cap</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+P">Pengzhou Cai</a>, <a href="/search/cs?searchtype=author&query=Lan%2C+L">Libin Lan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Ze Zhao</a>, <a href="/search/cs?searchtype=author&query=Galdran%2C+A">Adrian Galdran</a>, <a href="/search/cs?searchtype=author&query=Kumar%2C+A">Amit Kumar</a>, <a href="/search/cs?searchtype=author&query=Deotale%2C+G">Gunjan Deotale</a>, <a href="/search/cs?searchtype=author&query=Das%2C+D+K">Dev Kumar Das</a>, <a href="/search/cs?searchtype=author&query=Paik%2C+I">Inyoung Paik</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Joonho Lee</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G">Geongyu Lee</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yujia Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wangkai Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaoyang Li</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+X">Xuege Hou</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zeyuan Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shengjin Wang</a>, <a href="/search/cs?searchtype=author&query=Fischer%2C+M">Maximilian Fischer</a> , et al. (22 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07288v1-abstract-short" style="display: inline;"> Chronic kidney disease (CKD) is a major global health issue, affecting over 10% of the population and causing significant mortality. While kidney biopsy remains the gold standard for CKD diagnosis and treatment, the lack of comprehensive benchmarks for kidney pathology segmentation hinders progress in the field. 
To address this, we organized the Kidney Pathology Image Segmentation (KPIs) Challenge… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07288v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07288v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07288v1-abstract-full" style="display: none;"> Chronic kidney disease (CKD) is a major global health issue, affecting over 10% of the population and causing significant mortality. While kidney biopsy remains the gold standard for CKD diagnosis and treatment, the lack of comprehensive benchmarks for kidney pathology segmentation hinders progress in the field. To address this, we organized the Kidney Pathology Image Segmentation (KPIs) Challenge, introducing a dataset that incorporates preclinical rodent models of CKD with over 10,000 annotated glomeruli from 60+ Periodic Acid Schiff (PAS)-stained whole slide images. The challenge includes two tasks, patch-level segmentation and whole slide image segmentation and detection, evaluated using the Dice Similarity Coefficient (DSC) and F1-score. By encouraging innovative segmentation methods that adapt to diverse CKD models and tissue conditions, the KPIs Challenge aims to advance kidney pathology analysis, establish new benchmarks, and enable precise, large-scale quantification for disease research and diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07288v1-abstract-full').style.display = 'none'; document.getElementById('2502.07288v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07158">arXiv:2502.07158</a> <span> [<a href="https://arxiv.org/pdf/2502.07158">pdf</a>, <a href="https://arxiv.org/format/2502.07158">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Early Risk Prediction of Pediatric Cardiac Arrest from Electronic Health Records via Multimodal Fused Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jiaying Lu</a>, <a href="/search/cs?searchtype=author&query=Brown%2C+S+R">Stephanie R. 
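The KPIs challenge above scores segmentations with the Dice Similarity Coefficient (DSC). For reference, a minimal binary-mask implementation of DSC = 2|A∩B| / (|A| + |B|), with a small epsilon guarding empty masks:

```python
# Minimal Dice Similarity Coefficient for binary masks; toy data for the demo.
import numpy as np

def dice(pred: np.ndarray, target: np.ndarray, eps: float = 1e-7) -> float:
    pred = pred.astype(bool)
    target = target.astype(bool)
    intersection = np.logical_and(pred, target).sum()
    return (2.0 * intersection + eps) / (pred.sum() + target.sum() + eps)

a = np.zeros((64, 64), dtype=np.uint8); a[10:40, 10:40] = 1   # toy prediction
b = np.zeros((64, 64), dtype=np.uint8); b[15:45, 15:45] = 1   # toy ground truth
print(round(dice(a, b), 3))   # overlap of two shifted squares
```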
7. arXiv:2502.07158 [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: Early Risk Prediction of Pediatric Cardiac Arrest from Electronic Health Records via Multimodal Fused Transformer
Authors: Jiaying Lu, Stephanie R. Brown, Songyuan Liu, Shifan Zhao, Kejun Dong, Del Bold, Michael Fundora, Alaa Aljiffry, Alex Fedorov, Jocelyn Grunwell, Xiao Hu
Abstract: Early prediction of pediatric cardiac arrest (CA) is critical for timely intervention in high-risk intensive care settings. We introduce PedCA-FT, a novel transformer-based framework that fuses the tabular view of the EHR with a derived textual view of the EHR to fully unleash the interactions of high-dimensional risk factors and their dynamics. By employing dedicated transformer modules for each modality view, PedCA-FT captures complex temporal and contextual patterns to produce robust CA risk estimates. Evaluated on a curated pediatric cohort from the CHOA-CICU database, our approach outperforms ten other artificial intelligence models across five key performance metrics and identifies clinically meaningful risk factors. These findings underscore the potential of multimodal fusion techniques to enhance early CA detection and improve patient care.
Submitted 10 February, 2025; originally announced February 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06734">arXiv:2502.06734</a> <span> [<a href="https://arxiv.org/pdf/2502.06734">pdf</a>, <a href="https://arxiv.org/format/2502.06734">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Se帽orita-2M: A High-Quality Instruction-based Dataset for General Video Editing by Video Specialists </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zi%2C+B">Bojia Zi</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+P">Penghui Ruan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Marco Chen</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+X">Xianbiao Qi</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+S">Shaozhe Hao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shihao Zhao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Youze Huang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+B">Bin Liang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+R">Rong Xiao</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+K">Kam-Fai Wong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06734v1-abstract-short" style="display: inline;"> Recent advancements in video generation have spurred the development of video editing techniques, which can be divided into inversion-based and end-to-end methods. However, current video editing methods still suffer from several challenges. Inversion-based methods, though training-free and flexible, are time-consuming during inference, struggle with fine-grained editing instructions, and produce a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06734v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06734v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06734v1-abstract-full" style="display: none;"> Recent advancements in video generation have spurred the development of video editing techniques, which can be divided into inversion-based and end-to-end methods. However, current video editing methods still suffer from several challenges. Inversion-based methods, though training-free and flexible, are time-consuming during inference, struggle with fine-grained editing instructions, and produce artifacts and jitter. On the other hand, end-to-end methods, which rely on edited video pairs for training, offer faster inference speeds but often produce poor editing results due to a lack of high-quality training video pairs. In this paper, to close the gap in end-to-end methods, we introduce Se帽orita-2M, a high-quality video editing dataset. Se帽orita-2M consists of approximately 2 millions of video editing pairs. It is built by crafting four high-quality, specialized video editing models, each crafted and trained by our team to achieve state-of-the-art editing results. We also propose a filtering pipeline to eliminate poorly edited video pairs. 
Furthermore, we explore common video editing architectures to identify the most effective structure based on current pre-trained generative model. Extensive experiments show that our dataset can help to yield remarkably high-quality video editing results. More details are available at https://senorita.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06734v1-abstract-full').style.display = 'none'; document.getElementById('2502.06734v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05863">arXiv:2502.05863</a> <span> [<a href="https://arxiv.org/pdf/2502.05863">pdf</a>, <a href="https://arxiv.org/format/2502.05863">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Uni-Retrieval: A Multi-Style Retrieval Framework for STEM's Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jia%2C+Y">Yanhao Jia</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xinyi Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qinglin Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuxiao Hu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shuai Zhao</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+W">Wenqi Fan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05863v1-abstract-short" style="display: inline;"> In AI-facilitated teaching, leveraging various query styles to interpret abstract text descriptions is crucial for ensuring high-quality teaching. However, current retrieval models primarily focus on natural text-image retrieval, making them insufficiently tailored to educational scenarios due to the ambiguities in the retrieval process. In this paper, we propose a diverse expression retrieval tas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05863v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05863v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05863v1-abstract-full" style="display: none;"> In AI-facilitated teaching, leveraging various query styles to interpret abstract text descriptions is crucial for ensuring high-quality teaching. However, current retrieval models primarily focus on natural text-image retrieval, making them insufficiently tailored to educational scenarios due to the ambiguities in the retrieval process. 
9. arXiv:2502.05863 [pdf, other]
Subjects: cs.IR (Information Retrieval); cs.AI (Artificial Intelligence); cs.MM (Multimedia)
Title: Uni-Retrieval: A Multi-Style Retrieval Framework for STEM's Education
Authors: Yanhao Jia, Xinyi Wu, Hao Li, Qinglin Zhang, Yuxiao Hu, Shuai Zhao, Wenqi Fan
Abstract: In AI-facilitated teaching, leveraging various query styles to interpret abstract text descriptions is crucial for ensuring high-quality teaching. However, current retrieval models primarily focus on natural text-image retrieval, making them insufficiently tailored to educational scenarios due to the ambiguities in the retrieval process. In this paper, we propose a diverse expression retrieval task tailored to educational scenarios, supporting retrieval based on multiple query styles and expressions. We introduce the STEM Education Retrieval Dataset (SER), which contains over 24,000 query pairs of different styles, and Uni-Retrieval, an efficient and style-diversified retrieval vision-language model based on prompt tuning. Uni-Retrieval extracts query style features as prototypes and builds a continuously updated Prompt Bank containing prompt tokens for diverse queries. This bank can be updated during test time to represent domain-specific knowledge for different subject retrieval scenarios. Our framework demonstrates scalability and robustness by dynamically retrieving prompt tokens based on prototype similarity, effectively facilitating learning for unknown queries. Experimental results indicate that Uni-Retrieval outperforms existing retrieval models in most retrieval tasks. This advancement provides a scalable and precise solution for diverse educational needs.
Submitted 9 February, 2025; originally announced February 2025.
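The Uni-Retrieval entry describes a Prompt Bank whose prompt tokens are retrieved by similarity to query-style prototypes. The sketch below illustrates only that retrieval step, with made-up dimensions and a simple top-k cosine rule; it is not the paper's model or training procedure.

```python
# Illustration of a generic "prompt bank": each entry pairs a style
# prototype with prompt tokens, and a query pulls the tokens of its
# nearest prototypes by cosine similarity. All sizes are placeholders.
import numpy as np

rng = np.random.default_rng(0)
dim, n_styles, tokens_per_style = 32, 5, 4
prototypes = rng.normal(size=(n_styles, dim))                  # one per query style
prompt_bank = rng.normal(size=(n_styles, tokens_per_style, dim))

def retrieve_prompts(query_feat: np.ndarray, top_k: int = 2) -> np.ndarray:
    """Return prompt tokens of the top-k most similar style prototypes."""
    q = query_feat / np.linalg.norm(query_feat)
    p = prototypes / np.linalg.norm(prototypes, axis=1, keepdims=True)
    sims = p @ q                                               # cosine similarity
    best = np.argsort(sims)[::-1][:top_k]
    return prompt_bank[best].reshape(-1, dim)                  # (top_k*tokens, dim)

query = rng.normal(size=dim)                                   # encoded query feature
print(retrieve_prompts(query).shape)                           # (8, 32)
```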
10. arXiv:2502.04658 [pdf, other]
Subjects: q-bio.NC (Neurons and Cognition); cs.AI (Artificial Intelligence)
Title: Shifting Attention to You: Personalized Brain-Inspired AI Models
Authors: Stephen Chong Zhao, Yang Hu, Jason Lee, Andrew Bender, Trisha Mazumdar, Mark Wallace, David A. Tovar
Abstract: The integration of human and artificial intelligence represents a scientific opportunity to advance our understanding of information processing, as each system offers unique computational insights that can enhance and inform the other. The synthesis of human cognitive principles with artificial intelligence has the potential to produce more interpretable and functionally aligned computational models, while simultaneously providing a formal framework for investigating the neural mechanisms underlying perception, learning, and decision-making through systematic model comparisons and representational analyses. In this study, we introduce personalized brain-inspired modeling that integrates human behavioral embeddings and neural data to align with cognitive processes. We took a stepwise approach, fine-tuning the Contrastive Language-Image Pre-training (CLIP) model with large-scale behavioral decisions, group-level neural data, and finally, participant-level neural data within a broader framework that we have named CLIP-Human-Based Analysis (CLIP-HBA). We found that fine-tuning on behavioral data enhances its ability to predict human similarity judgments while indirectly aligning it with dynamic representations captured via MEG. To further gain mechanistic insights into the temporal evolution of cognitive processes, we introduced a model specifically fine-tuned on millisecond-level MEG neural dynamics (CLIP-HBA-MEG). This model resulted in enhanced temporal alignment with human neural processing while still showing improvement on behavioral alignment. Finally, we trained individualized models on participant-specific neural data, effectively capturing individualized neural dynamics and highlighting the potential for personalized AI systems. These personalized systems have far-reaching implications for the fields of medicine, cognitive research, human-computer interfaces, and AI development.
Submitted 6 February, 2025; originally announced February 2025.
Comments: 7 Figures, 3 Tables, 3 Supplemental Figures, 1 Supplemental Table
Specifically, to solve the semantic misalignment problem between camera and LiDAR BEV features, we introduce the Cross-modal Interaction Transform (CIT) module, enabling interaction between the two BEV feature spaces and enhancing feature representation through a self-attention mechanism. Additionally, we propose an effective Dual Dynamic Fusion (DDF) module to adaptively select valuable information from the different modalities, which can take full advantage of the inherent information between modalities. Moreover, MapFusion is designed to be simple and plug-and-play, easily integrated into existing pipelines. We evaluate MapFusion on two map construction tasks, High-definition (HD) map construction and BEV map segmentation, to show its versatility and effectiveness. Compared with state-of-the-art methods, MapFusion achieves 3.6% and 6.2% absolute improvements on the HD map construction and BEV map segmentation tasks on the nuScenes dataset, respectively, demonstrating the superiority of our approach.
Submitted 5 February, 2025; originally announced February 2025.
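As a rough, unofficial sketch of the fusion pattern this abstract describes, the snippet below runs self-attention over the concatenated camera and LiDAR BEV tokens (the CIT idea) and then fuses the two streams with a learned per-pixel gate (one plausible reading of dual dynamic fusion). Tensor shapes, layer sizes, and the gating form are assumptions, not MapFusion's implementation.

```python
# Hypothetical sketch of cross-modal BEV interaction followed by gated fusion.
# Shapes and module details are illustrative, not the MapFusion code.
import torch
import torch.nn as nn

class CrossModalInteraction(nn.Module):
    def __init__(self, channels: int, heads: int = 4):
        super().__init__()
        self.attn = nn.MultiheadAttention(channels, heads, batch_first=True)
        self.gate = nn.Sequential(nn.Conv2d(2 * channels, channels, 1), nn.Sigmoid())

    def forward(self, cam_bev: torch.Tensor, lidar_bev: torch.Tensor) -> torch.Tensor:
        b, c, h, w = cam_bev.shape
        # Flatten both BEV maps to token sequences and let them attend to each other.
        tokens = torch.cat([cam_bev, lidar_bev], dim=2).flatten(2).transpose(1, 2)  # (B, 2HW, C)
        tokens, _ = self.attn(tokens, tokens, tokens)
        cam_t, lidar_t = tokens.split(h * w, dim=1)
        cam_bev = cam_t.transpose(1, 2).reshape(b, c, h, w)
        lidar_bev = lidar_t.transpose(1, 2).reshape(b, c, h, w)
        # Dynamic fusion: a learned per-pixel gate decides how much of each modality to keep.
        g = self.gate(torch.cat([cam_bev, lidar_bev], dim=1))
        return g * cam_bev + (1 - g) * lidar_bev

fused = CrossModalInteraction(64)(torch.randn(1, 64, 32, 32), torch.randn(1, 64, 32, 32))
```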
arXiv:2502.03378 (https://arxiv.org/abs/2502.03378) [pdf, other]
Subjects: cs.CR (Cryptography and Security)
Title: Learning to Identify Conflicts in RPKI
Authors: Haya Schulmann, Shujie Zhao
Abstract: The long history of misconfigurations and errors in RPKI indicates that they cannot be easily avoided and will most probably also persist in the future. These errors create conflicts between BGP announcements and their covering ROAs, causing RPKI validation to return the status invalid. Networks that enforce RPKI filtering with Route Origin Validation (ROV) would block such conflicting BGP announcements and, as a result, lose traffic from the corresponding origins. Since the business incentives of networks are tightly coupled with the traffic they relay, filtering legitimate traffic leads to a loss of revenue, reducing the motivation to filter invalid announcements with ROV. In this work, we introduce a new mechanism, LOV, designed to whitelist benign conflicts on an Internet scale. The resulting whitelist is made available to RPKI-supporting ASes so they can avoid filtering RPKI-invalid but benign routes. Saving legitimate traffic removes one main obstacle to RPKI deployment. We measured live BGP updates with LOV over a period of half a year and whitelisted 52,846 routes with benign origin errors.
Submitted 5 February, 2025; originally announced February 2025.
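For readers unfamiliar with the conflicts this paper targets, the sketch below shows standard Route Origin Validation logic (valid / invalid / not-found for a BGP announcement against a set of ROAs) plus a trivial whitelist lookup of the kind a LOV-style feed could drive. The data structures and whitelist format are illustrative assumptions, not the paper's system.

```python
# Hypothetical sketch of Route Origin Validation plus a benign-conflict whitelist.
# ROA and announcement formats are simplified; this is not the LOV implementation.
from ipaddress import ip_network

ROAS = [  # (ROA prefix, max length, authorized origin AS)
    (ip_network("203.0.113.0/24"), 24, 64500),
    (ip_network("198.51.100.0/22"), 24, 64501),
]
BENIGN_WHITELIST = {("198.51.100.0/24", 64999)}  # routes judged to be benign conflicts

def rov_status(prefix: str, origin_as: int) -> str:
    net = ip_network(prefix)
    covering = [(p, ml, asn) for p, ml, asn in ROAS if net.subnet_of(p)]
    if not covering:
        return "not-found"
    if any(asn == origin_as and net.prefixlen <= ml for p, ml, asn in covering):
        return "valid"
    return "invalid"

def should_filter(prefix: str, origin_as: int) -> bool:
    # Drop RPKI-invalid routes unless they are whitelisted as benign conflicts.
    return rov_status(prefix, origin_as) == "invalid" and (prefix, origin_as) not in BENIGN_WHITELIST

print(rov_status("198.51.100.0/24", 64999))    # "invalid": origin AS not authorized
print(should_filter("198.51.100.0/24", 64999)) # False: whitelisted as a benign conflict
```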
arXiv:2502.03226 (https://arxiv.org/abs/2502.03226) [pdf, other]
Subjects: cs.HC (Human-Computer Interaction)
Title: COLP: Scaffolding Children's Online Long-term Collaborative Learning
Authors: Siyu Zha, Yuanrong Tang, Jiangtao Gong, Yingqing Xu
Abstract: Online collaborative learning and working are important for everyone, including children. However, children still face many difficulties communicating and working together while online, which keeps them from engaging in long-term project-based teamwork. We aim to investigate online long-term collaborative learning opportunities to address this gap. We designed COLP, an online, 16-week, project-based learning program, as an educational intervention based on multiple learning theories for primary school students. We conducted this program with 67 primary school students ages 8-13, across more than five provinces of China. We found that this program could engage more than one-third of the children in teamwork after long-term study. Furthermore, we interviewed children and their parents to understand the communication channels, benefits, and challenges of this program. Interestingly, we discovered that parents play multiple roles in their children's collaborative learning, particularly modeling and guiding the children's collaborative skills. Given the lack of programs designed for children's long-term online collaboration, this study may inspire intervention design in computer-supported collaborative learning communities.
Submitted 5 February, 2025; originally announced February 2025.
Comments: 42 pages, 13 figures. Submitted to International Journal of Human-Computer Interaction
MSC Class: 68U35 (Primary); 68T50 (Secondary); ACM Class: H.5.2; K.3.1

arXiv:2502.02581 (https://arxiv.org/abs/2502.02581) [pdf, other]
Subjects: cs.DC (Distributed, Parallel, and Cluster Computing)
Title: Hecate: Unlocking Efficient Sparse Model Training via Fully Sharded Sparse Data Parallelism
Authors: Yuhao Qing, Guichao Zhu, Fanxin Li, Lintian Lei, Zekai Sun, Xiuxian Guan, Shixiong Zhao, Xusheng Chen, Dong Huang, Sen Wang, Heming Cui
Abstract: Mixture-of-Experts (MoE) has emerged as a promising sparse paradigm for scaling up pre-trained models (PTMs) with remarkable cost-effectiveness. However, the dynamic nature of MoE leads to rapid fluctuations and imbalances in expert loads during training, resulting in significant straggler effects that hinder training performance when using expert parallelism (EP). Existing MoE training systems attempt to mitigate these effects through expert rearrangement strategies, but they face challenges in terms of memory efficiency and timeliness of rearrangement. This paper proposes Fully Sharded Sparse Data Parallelism (FSSDP), an innovative approach that tackles the parallelization of MoE layers and the potential straggler effects caused by imbalanced expert loads from a new perspective. FSSDP fully shards the parameters and optimizer states of MoE layers across devices and sparsely materializes MoE parameters from scratch in each iteration with two sparse collectives, SparseAllGather and SparseReduceScatter. We build Hecate, a high-performance MoE training system that incorporates FSSDP to fully unlock its potential. Hecate introduces heterogeneous sharding, sparse materialization, and re-materialization techniques to construct flexible and efficient expert placements with low memory and communication overhead. Our evaluation reveals that Hecate achieves up to 3.54x speedup over state-of-the-art MoE training systems and consistently demonstrates improvements across model architectures and hardware environments.
Submitted 4 February, 2025; originally announced February 2025.
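The collective names SparseAllGather and SparseReduceScatter come from the abstract, but the sketch below only simulates the core idea on plain Python data: each worker keeps a shard of every expert's weights, and a full expert is materialized only when the current batch actually routes tokens to it. The sharding layout and function names are invented for illustration and are not Hecate's API.

```python
# Toy, single-process simulation of sparsely materializing sharded expert weights.
# The sharding layout and function names are illustrative, not Hecate's API.
import numpy as np

N_WORKERS, N_EXPERTS, DIM = 4, 8, 16
rng = np.random.default_rng(0)

# Every expert's weight vector is split evenly across workers (fully sharded state).
full_experts = {e: rng.normal(size=DIM) for e in range(N_EXPERTS)}
shards = {
    w: {e: full_experts[e][w * DIM // N_WORKERS:(w + 1) * DIM // N_WORKERS]
        for e in range(N_EXPERTS)}
    for w in range(N_WORKERS)
}

def sparse_all_gather(active_experts):
    """Materialize only the experts that this iteration's batch routes tokens to."""
    return {e: np.concatenate([shards[w][e] for w in range(N_WORKERS)])
            for e in active_experts}

# Suppose the router sends this batch's tokens to experts 1 and 5 only.
materialized = sparse_all_gather({1, 5})
assert set(materialized) == {1, 5}
assert np.allclose(materialized[1], full_experts[1])
```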
arXiv:2502.01960 (https://arxiv.org/abs/2502.01960) [pdf, other]
Subjects: cs.LG (Machine Learning)
Title: MPIC: Position-Independent Multimodal Context Caching System for Efficient MLLM Serving
Authors: Shiju Zhao, Junhao Hu, Rongxiao Huang, Jiaqi Zheng, Guihai Chen
Abstract: Context caching is currently employed by prevailing serving platforms to accelerate Multimodal Large Language Model (MLLM) inference. However, this approach merely reuses the Key-Value (KV) cache of the initial sequence of the prompt, resulting in full KV cache recomputation even when the prefix differs only slightly. This becomes particularly inefficient in the context of interleaved text and images, as well as multimodal retrieval-augmented generation. This paper proposes position-independent caching as a more effective approach for multimodal information management. We have designed and implemented a caching system, named MPIC, to address both system-level and algorithm-level challenges. MPIC stores the KV cache on local or remote disks when receiving multimodal data, and calculates and loads the KV cache in parallel during inference. To mitigate accuracy degradation, we incorporate integrated reuse and recompute mechanisms within the system. The experimental results demonstrate that MPIC can achieve up to a 54% reduction in response time compared to existing context caching systems, while maintaining negligible or no accuracy loss.
Submitted 3 February, 2025; originally announced February 2025.
Comments: 14 pages, 11 figures, the first version
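As an informal illustration of position-independent caching, the sketch below keys cached entries by a content hash of each prompt segment rather than by its position in the prompt, so a reordered or partially shared multimodal prompt can still reuse earlier work. The cache granularity and the stand-in "encode" step are assumptions; MPIC's actual KV-cache handling is more involved.

```python
# Hypothetical sketch: content-addressed (position-independent) caching of per-segment work.
# A real system would cache transformer KV tensors; here we cache a dummy "encoding".
import hashlib

cache: dict[str, str] = {}

def segment_key(segment: bytes) -> str:
    return hashlib.sha256(segment).hexdigest()

def encode_segment(segment: bytes) -> str:
    return f"encoded({len(segment)} bytes)"    # stand-in for expensive KV computation

def process_prompt(segments: list[bytes]) -> list[str]:
    outputs, hits = [], 0
    for seg in segments:
        key = segment_key(seg)
        if key in cache:
            hits += 1                           # reused regardless of the segment's position
        else:
            cache[key] = encode_segment(seg)
        outputs.append(cache[key])
    print(f"cache hits: {hits}/{len(segments)}")
    return outputs

process_prompt([b"<image-1>", b"describe the image"])   # cold run: 0 hits
process_prompt([b"what changed?", b"<image-1>"])         # image reused despite a new position
```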
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01960v1-abstract-full').style.display = 'none'; document.getElementById('2502.01960v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 11 figures, the first version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00896">arXiv:2502.00896</a> <span> [<a href="https://arxiv.org/pdf/2502.00896">pdf</a>, <a href="https://arxiv.org/format/2502.00896">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LoR-VP: Low-Rank Visual Prompting for Efficient Vision Model Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+C">Can Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Ying Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+M">Mingyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shiyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenting Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaoxiao He</a>, <a href="/search/cs?searchtype=author&query=Han%2C+L">Ligong Han</a>, <a href="/search/cs?searchtype=author&query=Che%2C+T">Tong Che</a>, <a href="/search/cs?searchtype=author&query=Metaxas%2C+D+N">Dimitris N. Metaxas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00896v2-abstract-short" style="display: inline;"> Visual prompting has gained popularity as a method for adapting pre-trained models to specific tasks, particularly in the realm of parameter-efficient tuning. However, existing visual prompting techniques often pad the prompt parameters around the image, limiting the interaction between the visual prompts and the original image to a small set of patches while neglecting the inductive bias present… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00896v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00896v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00896v2-abstract-full" style="display: none;"> Visual prompting has gained popularity as a method for adapting pre-trained models to specific tasks, particularly in the realm of parameter-efficient tuning. However, existing visual prompting techniques often pad the prompt parameters around the image, limiting the interaction between the visual prompts and the original image to a small set of patches while neglecting the inductive bias present in shared information across different patches. In this study, we conduct a thorough preliminary investigation to identify and address these limitations. 
We propose a novel visual prompt design, introducing Low-Rank matrix multiplication for Visual Prompting (LoR-VP), which enables shared and patch-specific information across rows and columns of image pixels. Extensive experiments across seven network architectures and four datasets demonstrate significant improvements in both performance and efficiency compared to state-of-the-art visual prompting methods, achieving up to 6 times faster training, using 18 times fewer visual prompt parameters, and delivering a 3.1% improvement in performance. The code is available at https://github.com/jincan333/LoR-VP.
Submitted 3 February, 2025; v1 submitted 2 February, 2025; originally announced February 2025.
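A minimal sketch of the low-rank prompting idea as the abstract states it: the visual prompt is parameterized as the product of two small matrices, broadcast across channels and added to the whole image, so rows and columns of pixels share parameters. The rank, shapes, and plain additive form are assumptions about the design, not the authors' released code.

```python
# Minimal sketch of a low-rank visual prompt: prompt = B @ A, added to every channel.
# Rank and shapes are illustrative; see the authors' repository for the real LoR-VP.
import torch
import torch.nn as nn

class LowRankVisualPrompt(nn.Module):
    def __init__(self, height: int, width: int, rank: int = 4):
        super().__init__()
        self.B = nn.Parameter(torch.zeros(height, rank))         # row factors
        self.A = nn.Parameter(torch.randn(rank, width) * 0.01)   # column factors

    def forward(self, images: torch.Tensor) -> torch.Tensor:
        prompt = self.B @ self.A                        # (H, W) low-rank pattern
        return images + prompt.unsqueeze(0).unsqueeze(0)  # broadcast over batch and channels

prompt = LowRankVisualPrompt(224, 224, rank=4)
patched = prompt(torch.randn(8, 3, 224, 224))           # only B and A would be trained
print(sum(p.numel() for p in prompt.parameters()))      # 2 * 224 * 4 prompt parameters
```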
arXiv:2501.18124 (https://arxiv.org/abs/2501.18124) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: REMOTE: Real-time Ego-motion Tracking for Various Endoscopes via Multimodal Visual Feature Learning
Authors: Liangjing Shao, Benshuang Chen, Shuting Zhao, Xinrong Chen
Abstract: Real-time ego-motion tracking for endoscopes is a significant task for efficient navigation and robotic automation of endoscopy. In this paper, a novel framework is proposed to perform real-time ego-motion tracking for an endoscope. First, a multi-modal visual feature learning network is proposed to perform relative pose prediction, in which the motion feature from the optical flow, the scene features, and the joint feature from two adjacent observations are all extracted for prediction. Because the channel dimension of the concatenated image carries more correlation information, a novel feature extractor is designed based on an attention mechanism to integrate multi-dimensional information from the concatenation of two continuous frames. To extract a more complete feature representation from the fused features, a novel pose decoder is proposed to predict the pose transformation from the concatenated feature map at the end of the framework. Finally, the absolute pose of the endoscope is calculated from the relative poses. Experiments were conducted on three datasets of various endoscopic scenes, and the results demonstrate that the proposed method outperforms state-of-the-art methods. In addition, the inference speed of the proposed method is over 30 frames per second, which meets the real-time requirement. The project page is here: remote-bmxs.netlify.app
Submitted 2 February, 2025; v1 submitted 29 January, 2025; originally announced January 2025.
Comments: Accepted by ICRA 2025
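The last step this abstract mentions, recovering the absolute pose from a stream of predicted relative poses, is a chain of rigid-transform compositions; the sketch below shows that step with 4x4 homogeneous matrices. The matrix convention (left-multiplied composition starting from the identity) is an assumption for illustration.

```python
# Sketch: accumulate absolute poses from per-frame relative poses (4x4 homogeneous matrices).
# Convention assumed here: T_abs[k] = T_abs[k-1] @ T_rel[k], with T_abs[0] = identity.
import numpy as np

def relative_pose(yaw: float, t: np.ndarray) -> np.ndarray:
    c, s = np.cos(yaw), np.sin(yaw)
    T = np.eye(4)
    T[:3, :3] = np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])  # rotation about z
    T[:3, 3] = t
    return T

relative_poses = [relative_pose(0.05, np.array([0.01, 0.0, 0.002])) for _ in range(10)]

absolute = [np.eye(4)]
for T_rel in relative_poses:
    absolute.append(absolute[-1] @ T_rel)   # compose the newest relative motion

print(absolute[-1][:3, 3])                  # final camera position in the start frame
```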
arXiv:2501.17424 (https://arxiv.org/abs/2501.17424) [pdf, other]
Subjects: cs.RO (Robotics); cs.LG (Machine Learning)
Title: Certificated Actor-Critic: Hierarchical Reinforcement Learning with Control Barrier Functions for Safe Navigation
Authors: Junjun Xie, Shuhao Zhao, Liang Hu, Huijun Gao
Abstract: Control Barrier Functions (CBFs) have emerged as a prominent approach to designing safe navigation systems for robots. Despite their popularity, current CBF-based methods exhibit some limitations: optimization-based safe control techniques tend to be either myopic or computationally intensive, and they rely on simplified system models; conversely, learning-based methods suffer from the lack of quantitative indicators of navigation performance and safety. In this paper, we present a new model-free reinforcement learning algorithm called Certificated Actor-Critic (CAC), which introduces a hierarchical reinforcement learning framework and well-defined reward functions derived from CBFs. We carry out a theoretical analysis and proof of our algorithm and propose several improvements to its implementation. Our analysis is validated by two simulation experiments, showing the effectiveness of the proposed CAC algorithm.
Submitted 29 January, 2025; originally announced January 2025.
Comments: Accepted to ICRA 2025
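The abstract does not spell out the reward construction, so the sketch below shows one standard way a CBF can induce a safety reward: penalize any step that violates the discrete-time barrier condition h(x') >= (1 - alpha) * h(x). The dynamics, the barrier function, and alpha are illustrative assumptions, not the paper's definitions.

```python
# Hypothetical sketch: a safety reward derived from a discrete-time CBF condition.
# h(x) >= 0 defines the safe set; violating h(x') >= (1 - alpha) * h(x) is penalized.
import numpy as np

OBSTACLE, RADIUS, ALPHA = np.array([2.0, 2.0]), 1.0, 0.2

def h(x: np.ndarray) -> float:
    """Barrier: positive outside the obstacle disc, negative inside."""
    return float(np.linalg.norm(x - OBSTACLE) ** 2 - RADIUS ** 2)

def safety_reward(x: np.ndarray, x_next: np.ndarray) -> float:
    violation = (1.0 - ALPHA) * h(x) - h(x_next)   # > 0 means the CBF condition was broken
    return -max(0.0, violation)

x, x_next = np.array([0.0, 0.0]), np.array([0.5, 0.5])
print(safety_reward(x, x_next))   # negative here: h decays faster than the condition allows
```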
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICRA 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16761">arXiv:2501.16761</a> <span> [<a href="https://arxiv.org/pdf/2501.16761">pdf</a>, <a href="https://arxiv.org/format/2501.16761">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> CosyAudio: Improving Audio Generation with Confidence Scores and Synthetic Captions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xinfa Zhu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+W">Wenjie Tian</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinsheng Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+L">Lei He</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Sheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+L">Lei Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16761v1-abstract-short" style="display: inline;"> Text-to-Audio (TTA) generation is an emerging area within AI-generated content (AIGC), where audio is created from natural language descriptions. Despite growing interest, developing robust TTA models remains challenging due to the scarcity of well-labeled datasets and the prevalence of noisy or inaccurate captions in large-scale, weakly labeled corpora. To address these challenges, we propose Cos… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16761v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16761v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16761v1-abstract-full" style="display: none;"> Text-to-Audio (TTA) generation is an emerging area within AI-generated content (AIGC), where audio is created from natural language descriptions. Despite growing interest, developing robust TTA models remains challenging due to the scarcity of well-labeled datasets and the prevalence of noisy or inaccurate captions in large-scale, weakly labeled corpora. To address these challenges, we propose CosyAudio, a novel framework that utilizes confidence scores and synthetic captions to enhance the quality of audio generation. CosyAudio consists of two core components: AudioCapTeller and an audio generator. AudioCapTeller generates synthetic captions for audio and provides confidence scores to evaluate their accuracy. The audio generator uses these synthetic captions and confidence scores to enable quality-aware audio generation. Additionally, we introduce a self-evolving training strategy that iteratively optimizes CosyAudio across both well-labeled and weakly-labeled datasets. 
Initially trained with well-labeled data, AudioCapTeller leverages its assessment capabilities on weakly-labeled datasets for high-quality filtering and reinforcement learning, which further improves its performance. The well-trained AudioCapTeller then refines corpora by generating new captions and confidence scores, which serve to train the audio generator. Extensive experiments on open-source datasets demonstrate that CosyAudio outperforms existing models in automated audio captioning, generates more faithful audio, and exhibits strong generalization across diverse scenarios.
Submitted 28 January, 2025; originally announced January 2025.
Comments: 12 pages, 5 figures, 7 tables

arXiv:2501.16515 (https://arxiv.org/abs/2501.16515) [pdf, other]
Subjects: cs.HC (Human-Computer Interaction)
Title: SimulataR: Rapid Assisted Reality Prototyping using Design-Blended Videos
Authors: Ashwin Ram, Yue Gu, Bowen Wang, Sneha Jaikumar, Youqi Wu, Benjamin Tan Kuan Wei, Qingyang Xu, Haiming Liu, Shengdong Zhao
Abstract: Assisted Reality (aR) is a subfield of Augmented Reality (AR) that overlays information onto a user's immediate view via optical see-through head-mounted displays (OST-HMDs). This technology has proven effective and energy-efficient in supporting user-information interaction for everyday wearable intelligent systems. The aR viewing experience, however, is affected by varying real-world backgrounds, lighting, and user movements, which makes designing for aR challenging. Designers have to test their designs in situ across multiple real-world settings, which can be time-consuming and labor-intensive. We propose SimulataR, a cost-effective desktop-based approach for rapid aR prototyping that uses first-person-view context videos blended with design prototypes to simulate an aR experience. A field study involving 12 AR users comparing SimulataR to real OST-HMDs found that SimulataR can approximate the aR experience, particularly indoors and in low-to-moderately lit outdoor environments. Case studies with two designers who used SimulataR in their design process demonstrate the potential of design-blended videos for rapid aR prototyping.
Submitted 9 February, 2025; v1 submitted 27 January, 2025; originally announced January 2025.
arXiv:2501.16240 (https://arxiv.org/abs/2501.16240) [pdf, other]
Subjects: cs.HC (Human-Computer Interaction)
DOI: 10.1145/3706598.3713953 (https://doi.org/10.1145/3706598.3713953)
Title: AiGet: Transforming Everyday Moments into Hidden Knowledge Discovery with AI Assistance on Smart Glasses
Authors: Runze Cai, Nuwan Janaka, Hyeongcheol Kim, Yang Chen, Shengdong Zhao, Yun Huang, David Hsu
Abstract: Unlike the free exploration of childhood, the demands of daily life reduce our motivation to explore our surroundings, leading to missed opportunities for informal learning. Traditional tools for knowledge acquisition are reactive, relying on user initiative and limiting their ability to uncover hidden interests. Through formative studies, we introduce AiGet, a proactive AI assistant integrated with AR smart glasses, designed to seamlessly embed informal learning into low-demand daily activities (e.g., casual walking and shopping). AiGet analyzes real-time user gaze patterns, environmental context, and user profiles, leveraging large language models to deliver personalized, context-aware knowledge with low disruption to primary tasks. In-lab evaluations and real-world testing, including continued use over multiple days, demonstrate AiGet's effectiveness in uncovering overlooked yet surprising interests, enhancing primary task enjoyment, reviving curiosity, and deepening connections with the environment.
We further propose design guidelines for AI-assisted informal learning, focused on transforming everyday moments into enriching learning experiences.
Submitted 27 January, 2025; originally announced January 2025.
Comments: Conditionally accepted at CHI Conference on Human Factors in Computing Systems (CHI '25), April 26-May 1, 2025, Yokohama, Japan
ACM Class: I.2.10; H.5.1; H.5.2

arXiv:2501.15659 (https://arxiv.org/abs/2501.15659) [pdf, other]
Subjects: cs.RO (Robotics); cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Title: AirIO: Learning Inertial Odometry with Enhanced IMU Feature Observability
Authors: Yuheng Qiu, Can Xu, Yutian Chen, Shibo Zhao, Junyi Geng, Sebastian Scherer
Abstract: Inertial odometry (IO) using only Inertial Measurement Units (IMUs) offers a lightweight and cost-effective solution for Unmanned Aerial Vehicle (UAV) applications, yet existing learning-based IO models often fail to generalize to UAVs due to highly dynamic and non-linear flight patterns that differ from pedestrian motion. In this work, we identify that the conventional practice of transforming raw IMU data to global coordinates undermines the observability of critical kinematic information in UAVs. By preserving the body-frame representation, our method achieves substantial performance improvements, with a 66.7% average increase in accuracy across three datasets. Furthermore, explicitly encoding attitude information into the motion network yields an additional 23.8% improvement over prior results. Combined with a data-driven IMU correction model (AirIMU) and an uncertainty-aware Extended Kalman Filter (EKF), our approach ensures robust state estimation under aggressive UAV maneuvers without relying on external sensors or control inputs. Notably, our method also demonstrates strong generalizability to unseen data not included in the training set, underscoring its potential for real-world UAV applications.
Submitted 26 January, 2025; originally announced January 2025.
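To make the observability argument concrete, the sketch below contrasts the conventional preprocessing the abstract criticizes (rotating body-frame accelerations into the world frame with the estimated attitude) with the body-frame alternative it advocates, where the raw measurement is kept and the attitude is passed to the network as a separate input. The rotation convention and gravity handling are assumptions for illustration.

```python
# Sketch: world-frame vs body-frame treatment of a raw accelerometer sample.
# Conventions (R maps body to world, gravity along -z) are assumptions for illustration.
import numpy as np

GRAVITY = np.array([0.0, 0.0, -9.81])

def yaw_rotation(yaw: float) -> np.ndarray:
    c, s = np.cos(yaw), np.sin(yaw)
    return np.array([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])

acc_body = np.array([0.3, -0.1, 9.9])        # raw body-frame accelerometer reading
R_world_from_body = yaw_rotation(np.deg2rad(30.0))

# Conventional pipeline: rotate into world coordinates (attitude errors leak into the input).
acc_world = R_world_from_body @ acc_body + GRAVITY

# Body-frame pipeline (as advocated): keep the raw measurement, expose attitude separately.
network_input = {
    "acc_body": acc_body,
    "attitude": R_world_from_body,           # e.g., as a quaternion or flattened matrix in practice
}
print(acc_world, network_input["acc_body"])
```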
arXiv:2501.15368 (https://arxiv.org/abs/2501.15368) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.SD (Sound); eess.AS (Audio and Speech Processing)
Title: Baichuan-Omni-1.5 Technical Report
Authors: Yadong Li, Jun Liu, Tao Zhang, Tao Zhang, Song Chen, Tianpeng Li, Zehuan Li, Lijun Liu, Lingfeng Ming, Guosheng Dong, Da Pan, Chong Li, Yuanbo Fang, Dongdong Kuang, Mingrui Wang, Chenglin Zhu, Youwei Zhang, Hongyu Guo, Fengyu Zhang, Yuran Wang, Bowen Ding, Wei Song, Xu Li, Yuqi Huo, Zheng Liang, et al. (68 additional authors not shown)
Abstract: We introduce Baichuan-Omni-1.5, an omni-modal model that not only has omni-modal understanding capabilities but also provides end-to-end audio generation capabilities. To achieve fluent and high-quality interaction across modalities without compromising the capabilities of any modality, we prioritized optimizing three key aspects. First, we establish a comprehensive data cleaning and synthesis pipeline for multimodal data, obtaining about 500B high-quality data (text, audio, and vision). Second, an audio tokenizer (Baichuan-Audio-Tokenizer) has been designed to capture both semantic and acoustic information from audio, enabling seamless integration and enhanced compatibility with the MLLM. Lastly, we designed a multi-stage training strategy that progressively integrates multimodal alignment and multitask fine-tuning, ensuring effective synergy across all modalities. Baichuan-Omni-1.5 leads contemporary models (including GPT4o-mini and MiniCPM-o 2.6) in terms of comprehensive omni-modal capabilities. Notably, it achieves results comparable to leading models such as Qwen2-VL-72B across various multimodal medical benchmarks.
Submitted 25 January, 2025; originally announced January 2025.
arXiv:2501.14319 (https://arxiv.org/abs/2501.14319) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.RO (Robotics)
Title: Scalable Benchmarking and Robust Learning for Noise-Free Ego-Motion and 3D Reconstruction from Noisy Video
Authors: Xiaohao Xu, Tianyi Zhang, Shibo Zhao, Xiang Li, Sibo Wang, Yongqi Chen, Ye Li, Bhiksha Raj, Matthew Johnson-Roberson, Sebastian Scherer, Xiaonan Huang
Abstract: We aim to redefine robust ego-motion estimation and photorealistic 3D reconstruction by addressing a critical limitation: the reliance on noise-free data in existing models. While such sanitized conditions simplify evaluation, they fail to capture the unpredictable, noisy complexities of real-world environments. Dynamic motion, sensor imperfections, and synchronization perturbations lead to sharp performance declines when these models are deployed in practice, revealing an urgent need for frameworks that embrace and excel under real-world noise. To bridge this gap, we tackle three core challenges: scalable data generation, comprehensive benchmarking, and model robustness enhancement. First, we introduce a scalable noisy data synthesis pipeline that generates diverse datasets simulating complex motion, sensor imperfections, and synchronization errors.
Second, we leverage this pipeline to create Robust-Ego3D, a benchmark rigorously designed to expose noise-induced performance degradation, highlighting the limitations of current learning-based methods in ego-motion accuracy and 3D reconstruction quality. Third, we propose Correspondence-guided Gaussian Splatting (CorrGS), a novel test-time adaptation method that progressively refines an internal clean 3D representation by aligning noisy observations with RGB-D frames rendered from the clean 3D map, enhancing geometric alignment and appearance restoration through visual correspondence. Extensive experiments on synthetic and real-world data demonstrate that CorrGS consistently outperforms prior state-of-the-art methods, particularly in scenarios involving rapid motion and dynamic illumination.
Submitted 24 January, 2025; originally announced January 2025.
Comments: Accepted by ICLR 2025; 92 pages; project repo: https://github.com/Xiaohao-Xu/SLAM-under-Perturbation. arXiv admin note: substantial text overlap with arXiv:2406.16850
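As an informal illustration of the kind of perturbations such a noisy-data synthesis pipeline injects, the sketch below corrupts a clean trajectory with measurement noise, a constant timestamp desynchronization, and occasional dropped samples. The noise magnitudes and the perturbation set are assumptions; the benchmark's actual corruption taxonomy is richer.

```python
# Toy sketch of injecting sensor noise and synchronization error into clean trajectory data.
# Noise levels and perturbation types are illustrative, not the benchmark's definition.
import numpy as np

rng = np.random.default_rng(7)
t = np.linspace(0.0, 10.0, 501)                                  # clean timestamps (s)
positions = np.stack([np.cos(t), np.sin(t), 0.1 * t], axis=1)    # clean trajectory

def perturb(timestamps, traj, sigma=0.02, time_offset=0.015, dropout_p=0.01):
    noisy = traj + rng.normal(scale=sigma, size=traj.shape)      # sensor measurement noise
    shifted_t = timestamps + time_offset                         # constant desynchronization
    keep = rng.random(len(timestamps)) > dropout_p               # occasional dropped frames
    return shifted_t[keep], noisy[keep]

noisy_t, noisy_positions = perturb(t, positions)
print(f"{len(t)} clean samples -> {len(noisy_t)} perturbed samples")
```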
href="/search/cs?searchtype=author&query=Hausenloy%2C+J">Jason Hausenloy</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+O">Oliver Zhang</a>, <a href="/search/cs?searchtype=author&query=Mazeika%2C+M">Mantas Mazeika</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+T">Tung Nguyen</a>, <a href="/search/cs?searchtype=author&query=Anderson%2C+D">Daron Anderson</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+I+A">Imad Ali Shah</a>, <a href="/search/cs?searchtype=author&query=Doroshenko%2C+M">Mikhail Doroshenko</a>, <a href="/search/cs?searchtype=author&query=Stokes%2C+A+C">Alun Cennyth Stokes</a>, <a href="/search/cs?searchtype=author&query=Mahmood%2C+M">Mobeen Mahmood</a> , et al. (710 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14249v3-abstract-short" style="display: inline;"> Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity's Last Exam (HLE), a multi-modal benchmark at the frontier of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14249v3-abstract-full').style.display = 'inline'; document.getElementById('2501.14249v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14249v3-abstract-full" style="display: none;"> Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity's Last Exam (HLE), a multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage. HLE consists of 3,000 questions across dozens of subjects, including mathematics, humanities, and the natural sciences. HLE is developed globally by subject-matter experts and consists of multiple-choice and short-answer questions suitable for automated grading. Each question has a known solution that is unambiguous and easily verifiable, but cannot be quickly answered via internet retrieval. State-of-the-art LLMs demonstrate low accuracy and calibration on HLE, highlighting a significant gap between current LLM capabilities and the expert human frontier on closed-ended academic questions. To inform research and policymaking upon a clear understanding of model capabilities, we publicly release HLE at https://lastexam.ai. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14249v3-abstract-full').style.display = 'none'; document.getElementById('2501.14249v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14166">arXiv:2501.14166</a> <span> [<a href="https://arxiv.org/pdf/2501.14166">pdf</a>, <a href="https://arxiv.org/format/2501.14166">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Multimodal Entity Linking with Jaccard Distance-based Conditional Contrastive Learning and Contextual Visual Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nguyen%2C+C">Cong-Duy Nguyen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaobao Wu</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+T">Thong Nguyen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shuai Zhao</a>, <a href="/search/cs?searchtype=author&query=Le%2C+K">Khoi Le</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+V">Viet-Anh Nguyen</a>, <a href="/search/cs?searchtype=author&query=Yichao%2C+F">Feng Yichao</a>, <a href="/search/cs?searchtype=author&query=Luu%2C+A+T">Anh Tuan Luu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14166v1-abstract-short" style="display: inline;"> Previous research on multimodal entity linking (MEL) has primarily employed contrastive learning as the primary objective. However, using the rest of the batch as negative samples without careful consideration, these studies risk leveraging easy features and potentially overlook essential details that make entities unique. In this work, we propose JD-CCL (Jaccard Distance-based Conditional Contras… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14166v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14166v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14166v1-abstract-full" style="display: none;"> Previous research on multimodal entity linking (MEL) has primarily employed contrastive learning as the primary objective. However, using the rest of the batch as negative samples without careful consideration, these studies risk leveraging easy features and potentially overlook essential details that make entities unique. In this work, we propose JD-CCL (Jaccard Distance-based Conditional Contrastive Learning), a novel approach designed to enhance the ability to match multimodal entity linking models. JD-CCL leverages meta-information to select negative samples with similar attributes, making the linking task more challenging and robust. Additionally, to address the limitations caused by the variations within the visual modality among mentions and entities, we introduce a novel method, CVaCPT (Contextual Visual-aid Controllable Patch Transform). 
arXiv:2501.14166 [pdf, other] cs.CV cs.AI
Title: Enhancing Multimodal Entity Linking with Jaccard Distance-based Conditional Contrastive Learning and Contextual Visual Augmentation
Authors: Cong-Duy Nguyen, Xiaobao Wu, Thong Nguyen, Shuai Zhao, Khoi Le, Viet-Anh Nguyen, Feng Yichao, Anh Tuan Luu
Abstract: Previous research on multimodal entity linking (MEL) has primarily employed contrastive learning as the training objective. However, by using the rest of the batch as negative samples without careful consideration, these studies risk exploiting easy features and potentially overlooking essential details that make entities unique. In this work, we propose JD-CCL (Jaccard Distance-based Conditional Contrastive Learning), a novel approach designed to enhance the matching ability of multimodal entity linking models. JD-CCL leverages meta-information to select negative samples with similar attributes, making the linking task more challenging and robust. Additionally, to address the limitations caused by variations within the visual modality among mentions and entities, we introduce a novel method, CVaCPT (Contextual Visual-aid Controllable Patch Transform). It enhances visual representations by incorporating multi-view synthetic images and contextual textual representations to scale and shift patch representations. Experimental results on benchmark MEL datasets demonstrate the strong effectiveness of our approach.
Submitted 23 January, 2025; originally announced January 2025.
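The negative-selection idea in JD-CCL can be sketched directly from the Jaccard distance 1 - |A ∩ B| / |A ∪ B| between attribute sets: candidates whose meta-information overlaps most with the positive entity make the hardest negatives. The attribute sets below are hypothetical, and this is not the paper's implementation.

def jaccard_distance(a: set, b: set) -> float:
    """1 - |A intersect B| / |A union B|; 0 means identical attribute sets."""
    if not a and not b:
        return 0.0
    return 1.0 - len(a & b) / len(a | b)

def select_hard_negatives(positive_attrs, candidates, k=2):
    """Rank candidate entities by attribute similarity to the positive entity
    (smallest Jaccard distance first) and keep the k hardest negatives."""
    ranked = sorted(candidates.items(),
                    key=lambda item: jaccard_distance(positive_attrs, item[1]))
    return [entity_id for entity_id, _ in ranked[:k]]

# Toy meta-information: entity id -> attribute set
candidates = {"e1": {"person", "athlete", "us"},
              "e2": {"person", "politician", "us"},
              "e3": {"building", "museum"}}
print(select_hard_negatives({"person", "athlete", "uk"}, candidates))  # ['e1', 'e2']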
arXiv:2501.13920 [pdf, other] cs.CV cs.CL cs.LG
Title: IMAGINE-E: Image Generation Intelligence Evaluation of State-of-the-art Text-to-Image Models
Authors: Jiayi Lei, Renrui Zhang, Xiangfei Hu, Weifeng Lin, Zhen Li, Wenjian Sun, Ruoyi Du, Le Zhuo, Zhongyu Li, Xinyue Li, Shitian Zhao, Ziyu Guo, Yiting Lu, Peng Gao, Hongsheng Li
Abstract: With the rapid development of diffusion models, text-to-image (T2I) models have made significant progress, showcasing impressive abilities in prompt following and image generation. Recently launched models such as FLUX.1 and Ideogram2.0, along with others like Dall-E3 and Stable Diffusion 3, have demonstrated exceptional performance across various complex tasks, raising questions about whether T2I models are moving towards general-purpose applicability. Beyond traditional image generation, these models exhibit capabilities across a range of fields, including controllable generation, image editing, video, audio, 3D, and motion generation, as well as computer vision tasks like semantic segmentation and depth estimation. However, current evaluation frameworks are insufficient to comprehensively assess these models' performance across expanding domains. To thoroughly evaluate these models, we developed IMAGINE-E and tested six prominent models: FLUX.1, Ideogram2.0, Midjourney, Dall-E3, Stable Diffusion 3, and Jimeng. Our evaluation is divided into five key domains: structured output generation, realism and physical consistency, specific domain generation, challenging scenario generation, and multi-style creation tasks. This comprehensive assessment highlights each model's strengths and limitations, particularly the outstanding performance of FLUX.1 and Ideogram2.0 in structured and specific domain tasks, underscoring the expanding applications and potential of T2I models as foundational AI tools. This study provides valuable insights into the current state and future trajectory of T2I models as they evolve towards general-purpose usability. Evaluation scripts will be released at https://github.com/jylei16/Imagine-e.
Submitted 23 January, 2025; originally announced January 2025.
Comments: 75 pages, 73 figures, Evaluation scripts: https://github.com/jylei16/Imagine-e

arXiv:2501.13020 [pdf, other] cs.HC
Title: Characterizing Collective Efforts in Content Sharing and Quality Control for ADHD-relevant Content on Video-sharing Platforms
Authors: Hanxiu 'Hazel' Zhu, Avanthika Senthil Kumar, Sihang Zhao, Ru Wang, Xin Tong, Yuhang Zhao
Abstract: Video-sharing platforms (VSPs) have become increasingly important for individuals with ADHD to recognize symptoms, acquire knowledge, and receive support. While videos offer rich information and high engagement, they also present unique challenges, such as information quality and accessibility issues for users with ADHD. However, little work has thoroughly examined the video content quality and accessibility issues, their impact, and the control strategies in the ADHD community. We fill this gap by systematically collecting 373 ADHD-relevant videos with comments from YouTube and TikTok and analyzing the data with mixed methods. Our study identified the characteristics of ADHD-relevant videos on VSPs (e.g., creator types, video presentation forms, quality issues) and revealed the collective efforts of creators and viewers in video quality control, such as authority building, collective quality checking, and accessibility improvement. We further derive actionable design implications for VSPs to offer more reliable and ADHD-friendly content.
Submitted 22 January, 2025; originally announced January 2025.

arXiv:2501.12844 [pdf, other] cs.CV cs.AI
Title: GAMED-Snake: Gradient-aware Adaptive Momentum Evolution Deep Snake Model for Multi-organ Segmentation
Authors: Ruicheng Zhang, Haowei Guo, Zeyu Zhang, Puxin Yan, Shen Zhao
Abstract: Multi-organ segmentation is a critical yet challenging task due to complex anatomical backgrounds, blurred boundaries, and diverse morphologies. This study introduces the Gradient-aware Adaptive Momentum Evolution Deep Snake (GAMED-Snake) model, which establishes a novel paradigm for contour-based segmentation by integrating gradient-based learning with adaptive momentum evolution mechanisms. The GAMED-Snake model incorporates three major innovations: First, the Distance Energy Map Prior (DEMP) generates a pixel-level force field that effectively attracts contour points towards the true boundaries, even in scenarios with complex backgrounds and blurred edges. Second, the Differential Convolution Inception Module (DCIM) precisely extracts comprehensive energy gradients, significantly enhancing segmentation accuracy. Third, the Adaptive Momentum Evolution Mechanism (AMEM) employs cross-attention to establish dynamic features across different iterations of evolution, enabling precise boundary alignment for diverse morphologies. Experimental results on four challenging multi-organ segmentation datasets demonstrate that GAMED-Snake improves the mDice metric by approximately 2% compared to state-of-the-art methods. Code will be available at https://github.com/SYSUzrc/GAMED-Snake.
Submitted 22 January, 2025; originally announced January 2025.
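The contour-evolution idea behind snake-style segmentation can be sketched as gradient descent with momentum on a distance-energy map: each contour point slides downhill on the energy surface, and a momentum term smooths the updates. This is a generic active-contour sketch with assumed inputs and step sizes, not the GAMED-Snake model.

import numpy as np

def evolve_contour(points, energy, n_iters=100, step=0.5, momentum=0.8):
    """Move (N, 2) contour points downhill on a 2-D energy map (e.g. a distance
    map to the target boundary) using momentum-smoothed gradient updates."""
    grad_r, grad_c = np.gradient(energy)          # per-pixel energy gradients
    velocity = np.zeros_like(points)
    for _ in range(n_iters):
        r = np.clip(points[:, 0].round().astype(int), 0, energy.shape[0] - 1)
        c = np.clip(points[:, 1].round().astype(int), 0, energy.shape[1] - 1)
        grad = np.stack([grad_r[r, c], grad_c[r, c]], axis=1)
        velocity = momentum * velocity - step * grad
        points = points + velocity
    return points

# Toy energy map: distance to a circle of radius 20 in a 100x100 image
rows, cols = np.mgrid[:100, :100]
energy = np.abs(np.hypot(rows - 50, cols - 50) - 20.0)
angles = np.linspace(0, 2 * np.pi, 32, endpoint=False)
init = np.stack([50 + 40 * np.sin(angles), 50 + 40 * np.cos(angles)], axis=1)
final = evolve_contour(init, energy)              # points contract toward the circle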
arXiv:2501.10811 [pdf, other] cs.SD eess.AS
Title: MusicEval: A Generative Music Corpus with Expert Ratings for Automatic Text-to-Music Evaluation
Authors: Cheng Liu, Hui Wang, Jinghua Zhao, Shiwan Zhao, Hui Bu, Xin Xu, Jiaming Zhou, Haoqin Sun, Yong Qin
Abstract: The technology for generating music from textual descriptions has seen rapid advancements. However, evaluating text-to-music (TTM) systems remains a significant challenge, primarily due to the difficulty of balancing performance and cost with existing objective and subjective evaluation methods. In this paper, we propose an automatic assessment task for TTM models to align with human perception. To address the TTM evaluation challenges posed by the professional requirements of music evaluation and the complexity of the relationship between text and music, we collect MusicEval, the first generative music assessment dataset. This dataset contains 2,748 music clips generated by 31 advanced and widely used models in response to 384 text prompts, along with 13,740 ratings from 14 music experts. Furthermore, we design a CLAP-based assessment model built on this dataset, and our experimental results validate the feasibility of the proposed task, providing a valuable reference for future development in TTM evaluation. The dataset is available at https://www.aishelltech.com/AISHELL_7A.
Submitted 18 January, 2025; originally announced January 2025.
Comments: Accepted by ICASSP 2025

arXiv:2501.10052 [pdf, other] cs.SD eess.AS
Title: Conditional Latent Diffusion-Based Speech Enhancement Via Dual Context Learning
Authors: Shengkui Zhao, Zexu Pan, Kun Zhou, Yukun Ma, Chong Zhang, Bin Ma
Abstract: Recently, the application of diffusion probabilistic models has advanced speech enhancement through generative approaches. However, existing diffusion-based methods have focused on the generation process in high-dimensional waveform or spectral domains, leading to increased generation complexity and slower inference speeds. Additionally, these methods have primarily modelled clean speech distributions, with limited exploration of noise distributions, thereby constraining the discriminative capability of diffusion models for speech enhancement. To address these issues, we propose a novel approach that integrates a conditional latent diffusion model (cLDM) with dual-context learning (DCL). Our method utilizes a variational autoencoder (VAE) to compress mel-spectrograms into a low-dimensional latent space. We then apply cLDM to transform the latent representations of both clean speech and background noise into Gaussian noise by the DCL process, and a parameterized model is trained to reverse this process, conditioned on noisy latent representations and text embeddings. By operating in a lower-dimensional space, the latent representations reduce the complexity of the generation process, while the DCL process enhances the model's ability to handle diverse and unseen noise environments. Our experiments demonstrate the strong performance of the proposed approach compared to existing diffusion-based methods, even with fewer iterative steps, and highlight the superior generalization capability of our models to out-of-domain noise datasets (https://github.com/modelscope/ClearerVoice-Studio).
Submitted 17 January, 2025; originally announced January 2025.
Comments: 5 pages, 1 figure, accepted by ICASSP 2025
arXiv:2501.10045 [pdf, other] cs.SD eess.AS
Title: HiFi-SR: A Unified Generative Transformer-Convolutional Adversarial Network for High-Fidelity Speech Super-Resolution
Authors: Shengkui Zhao, Kun Zhou, Zexu Pan, Yukun Ma, Chong Zhang, Bin Ma
Abstract: The application of generative adversarial networks (GANs) has recently advanced speech super-resolution (SR) based on intermediate representations like mel-spectrograms. However, existing SR methods that typically rely on independently trained and concatenated networks may lead to inconsistent representations and poor speech quality, especially in out-of-domain scenarios. In this work, we propose HiFi-SR, a unified network that leverages end-to-end adversarial training to achieve high-fidelity speech super-resolution. Our model features a unified transformer-convolutional generator designed to seamlessly handle both the prediction of latent representations and their conversion into time-domain waveforms. The transformer network serves as a powerful encoder, converting low-resolution mel-spectrograms into latent space representations, while the convolutional network upscales these representations into high-resolution waveforms. To enhance high-frequency fidelity, we incorporate a multi-band, multi-scale time-frequency discriminator, along with a multi-scale mel-reconstruction loss in the adversarial training process. HiFi-SR is versatile, capable of upscaling any input speech signal between 4 kHz and 32 kHz to a 48 kHz sampling rate. Experimental results demonstrate that HiFi-SR significantly outperforms existing speech SR methods across both objective metrics and ABX preference tests, for both in-domain and out-of-domain scenarios (https://github.com/modelscope/ClearerVoice-Studio).
Submitted 17 January, 2025; originally announced January 2025.
Comments: 5 pages, 5 figures, accepted by ICASSP 2025
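A multi-scale mel-reconstruction loss of the kind mentioned above can be written as an L1 distance between log-mel spectrograms computed at several STFT resolutions. The resolutions below are assumptions for illustration, not necessarily the exact HiFi-SR configuration.

import torch
import torchaudio

def multi_scale_mel_loss(pred_wav, target_wav, sample_rate=48_000,
                         scales=((512, 128, 40), (1024, 256, 80), (2048, 512, 128))):
    """Sum of L1 distances between log-mel spectrograms at several
    (n_fft, hop_length, n_mels) resolutions."""
    loss = 0.0
    for n_fft, hop_length, n_mels in scales:
        to_mel = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate, n_fft=n_fft,
            hop_length=hop_length, n_mels=n_mels)
        pred_mel = torch.log(to_mel(pred_wav).clamp(min=1e-5))
        target_mel = torch.log(to_mel(target_wav).clamp(min=1e-5))
        loss = loss + torch.nn.functional.l1_loss(pred_mel, target_mel)
    return loss

# Toy half-second batch of generated vs. reference 48 kHz audio
print(multi_scale_mel_loss(torch.randn(2, 24_000), torch.randn(2, 24_000)))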
arXiv:2501.09246 [pdf, other] cs.CR
Title: Practical Spoofing Attacks on Galileo Open Service Navigation Message Authentication
Authors: Haiyang Wang, Yuanyu Zhang, Xinghui Zhu, Ji He, Shuangtrui Zhao, Yulong Shen, Xiaohong Jiang
Abstract: This paper examines the Galileo Open Service Navigation Message Authentication (OSNMA) and, for the first time, discovers two critical vulnerabilities, namely artificially-manipulated time synchronization (ATS) and interruptible message authentication (IMA). ATS allows attackers to falsify a receiver's signals and/or local reference time (LRT) while still fulfilling the time synchronization (TS) requirement. IMA allows temporary interruption of the navigation data authentication process due to the reception of a broken message (probably caused by spoofing attacks) and restores the authentication later. By exploiting the ATS vulnerability, we propose a TS-comply replay (TSR) attack with two variants (real-time and non-real-time), where attackers replay signals to a victim receiver while strictly complying with the TS rule. We further propose a TS-comply forgery (TSF) attack, where attackers first use a previously-disclosed key to forge a message based on the OSNMA protocol, then tamper with the victim receiver's LRT correspondingly to comply with the TS rule, and finally transmit the forged message to the receiver. Finally, we propose a concatenating replay (CR) attack based on the IMA vulnerability, where attackers concatenate replayed signals to the victim receiver's signals in a way that still enables correct verification of the navigation data in the replayed signals. To validate the effectiveness of the proposed attacks, we conduct real-world experiments with a commercial Galileo receiver manufactured by Septentrio, two software-defined radio (SDR) devices, and the open-source Galileo-SDR-SIM and OSNMAlib software. The results showed that all the attacks can successfully pass the OSNMA checks and that the TSF attack can spoof receivers to arbitrary locations.
Submitted 15 January, 2025; originally announced January 2025.
Comments: 16 pages, 20 figures

arXiv:2501.08585 [pdf] eess.SP cs.AI cs.CV cs.LG
Title: A Systematic Review of Machine Learning Methods for Multimodal EEG Data in Clinical Application
Authors: Siqi Zhao, Wangyang Li, Xiru Wang, Stevie Foglia, Hongzhao Tan, Bohan Zhang, Ameer Hamoodi, Aimee Nelson, Zhen Gao
Abstract: Machine learning (ML) and deep learning (DL) techniques have been widely applied to analyze electroencephalography (EEG) signals for disease diagnosis and brain-computer interfaces (BCI). The integration of multimodal data has been shown to enhance the accuracy of ML and DL models. Combining EEG with other modalities can improve clinical decision-making by addressing complex tasks in clinical populations. This systematic literature review explores the use of multimodal EEG data in ML and DL models for clinical applications. A comprehensive search was conducted across PubMed, Web of Science, and Google Scholar, yielding 16 relevant studies after three rounds of filtering. These studies demonstrate the application of multimodal EEG data in addressing clinical challenges, including neuropsychiatric disorders, neurological conditions (e.g., seizure detection), neurodevelopmental disorders (e.g., autism spectrum disorder), and sleep stage classification. Data fusion occurred at three levels: signal, feature, and decision levels. The most commonly used ML models were support vector machines (SVM) and decision trees. Notably, 11 out of the 16 studies reported improvements in model accuracy with multimodal EEG data. This review highlights the potential of multimodal EEG-based ML models in enhancing clinical diagnostics and problem-solving.
Submitted 31 December, 2024; originally announced January 2025.
Comments: This paper includes 4 figures, 6 tables, and totals 18 pages
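Feature-level fusion, the middle of the three fusion levels listed above, amounts to concatenating per-modality feature vectors before fitting a classifier such as an SVM. The sketch below uses synthetic features and labels purely for illustration; it is not taken from any of the reviewed studies.

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

rng = np.random.default_rng(0)
n_samples = 200
eeg_features = rng.normal(size=(n_samples, 32))    # e.g. band-power features
other_features = rng.normal(size=(n_samples, 8))   # e.g. heart-rate or eye-tracking features
labels = rng.integers(0, 2, size=n_samples)        # binary clinical label

# Feature-level fusion: one concatenated feature vector per sample
fused = np.concatenate([eeg_features, other_features], axis=1)

clf = SVC(kernel="rbf", C=1.0)
print(cross_val_score(clf, fused, labels, cv=5).mean())   # ~0.5 on random data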
arXiv:2501.06151 [pdf, other] eess.IV cs.CV
Title: PySpatial: A High-Speed Whole Slide Image Pathomics Toolkit
Authors: Yuechen Yang, Yu Wang, Tianyuan Yao, Ruining Deng, Mengmeng Yin, Shilin Zhao, Haichun Yang, Yuankai Huo
Abstract: Whole Slide Image (WSI) analysis plays a crucial role in modern digital pathology, enabling large-scale feature extraction from tissue samples. However, traditional feature extraction pipelines based on tools like CellProfiler often involve lengthy workflows, requiring WSI segmentation into patches, feature extraction at the patch level, and subsequent mapping back to the original WSI. To address these challenges, we present PySpatial, a high-speed pathomics toolkit specifically designed for WSI-level analysis. PySpatial streamlines the conventional pipeline by directly operating on computational regions of interest, reducing redundant processing steps. Utilizing rtree-based spatial indexing and matrix-based computation, PySpatial efficiently maps and processes computational regions, significantly accelerating feature extraction while maintaining high accuracy. Our experiments on two datasets, Perivascular Epithelioid Cell (PEC) and data from the Kidney Precision Medicine Project (KPMP), demonstrate substantial performance improvements. For smaller and sparse objects in PEC datasets, PySpatial achieves nearly a 10-fold speedup compared to standard CellProfiler pipelines. For larger objects, such as glomeruli and arteries in KPMP datasets, PySpatial achieves a 2-fold speedup. These results highlight PySpatial's potential to handle large-scale WSI analysis with enhanced efficiency and accuracy, paving the way for broader applications in digital pathology.
Submitted 10 January, 2025; originally announced January 2025.
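The rtree-based spatial indexing mentioned in this abstract can be sketched with the Python rtree package: object bounding boxes go into an R-tree so the objects inside a computational region can be retrieved without scanning every patch. This is a minimal illustration of the indexing idea, not PySpatial's API.

from rtree import index

# Bounding boxes (x_min, y_min, x_max, y_max) of segmented objects on a slide
objects = {
    0: (100, 100, 150, 160),   # e.g. a small cell
    1: (400, 420, 480, 500),   # e.g. a glomerulus
    2: (120, 900, 170, 960),
}

idx = index.Index()
for obj_id, bbox in objects.items():
    idx.insert(obj_id, bbox)

# Query one computational region of interest instead of iterating over patches
region_of_interest = (0, 0, 200, 200)
print(list(idx.intersection(region_of_interest)))   # -> [0]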
arXiv:2501.04931 [pdf, other] cs.CR cs.AI cs.CL
Title: Jailbreaking Multimodal Large Language Models via Shuffle Inconsistency
Authors: Shiji Zhao, Ranjie Duan, Fengxiang Wang, Chi Chen, Caixin Kang, Jialing Tao, YueFeng Chen, Hui Xue, Xingxing Wei
Abstract: Multimodal Large Language Models (MLLMs) have achieved impressive performance and have been put into practical use in commercial applications, but they still have potential safety mechanism vulnerabilities. Jailbreak attacks are red-teaming methods that aim to bypass safety mechanisms and discover MLLMs' potential risks. Existing MLLM jailbreak methods often bypass the model's safety mechanism through complex optimization methods or carefully designed image and text prompts. Despite achieving some progress, they have a low attack success rate on commercial closed-source MLLMs. Unlike previous research, we empirically find that there exists a Shuffle Inconsistency between MLLMs' comprehension ability and safety ability for shuffled harmful instructions. That is, from the perspective of comprehension ability, MLLMs can understand shuffled harmful text-image instructions well. However, from the perspective of safety ability, they can be easily bypassed by the shuffled harmful instructions, leading to harmful responses. We then propose a text-image jailbreak attack named SI-Attack. Specifically, to fully utilize the Shuffle Inconsistency and overcome the shuffle randomness, we apply a query-based black-box optimization method to select the most harmful shuffled inputs based on the feedback of the toxic judge model. A series of experiments show that SI-Attack can improve the attack's performance on three benchmarks. In particular, SI-Attack can markedly improve the attack success rate for commercial MLLMs such as GPT-4o or Claude-3.5-Sonnet.
Submitted 8 January, 2025; originally announced January 2025.

arXiv:2501.04489 [pdf, other] cs.DC
Title: Collaborative Inference Acceleration with Non-Penetrative Tensor Partitioning
Authors: Zhibang Liu, Chaonong Xu, Zhenjie Lv, Zhizhuo Liu, Suyu Zhao
Abstract: The inference of large-sized images on Internet of Things (IoT) devices is commonly hindered by limited resources, while there are often stringent latency requirements for Deep Neural Network (DNN) inference. Currently, this problem is generally addressed by collaborative inference, where the large-sized image is partitioned into multiple tiles, and each tile is assigned to an IoT device for processing. However, because significant latency is incurred by the communication overhead of tile sharing, the existing collaborative inference strategy is inefficient for convolutional computation, which is indispensable for any DNN. To reduce this overhead, we propose Non-Penetrative Tensor Partitioning (NPTP), a fine-grained tensor partitioning method that reduces communication latency by minimizing the communication load of shared tiles, thereby reducing inference latency. We evaluate NPTP with four widely adopted DNN models. Experimental results demonstrate that NPTP achieves a 1.44-1.68x inference speedup relative to CoEdge, a state-of-the-art (SOTA) collaborative inference algorithm.
Submitted 8 January, 2025; originally announced January 2025.
Comments: Accepted by the 2025 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)
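The communication load that tile partitioning tries to minimize comes from the border (halo) pixels neighbouring devices must exchange so convolutions near tile edges see their full receptive field. A back-of-the-envelope sketch for a simple grid partition is shown below; NPTP's actual fine-grained partitioning strategy is more sophisticated than this.

def halo_exchange_pixels(height, width, grid_rows, grid_cols, halo=1):
    """Approximate pixels exchanged between neighbouring tiles when an image is
    split into a grid_rows x grid_cols grid and each convolution needs `halo`
    border rows/columns from adjacent tiles (both sides of every internal cut)."""
    horizontal_cuts = (grid_rows - 1) * width
    vertical_cuts = (grid_cols - 1) * height
    return 2 * halo * (horizontal_cuts + vertical_cuts)

# A 1024x1024 input split across 4 devices with 3x3 convolutions (halo=1):
print(halo_exchange_pixels(1024, 1024, 2, 2))   # 2x2 grid   -> 4096 pixels
print(halo_exchange_pixels(1024, 1024, 4, 1))   # 4x1 strips -> 6144 pixels

Even this crude count shows that the shape of the partition changes the communication load, which is the quantity a partitioning method like NPTP sets out to minimize at a finer granularity.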
In this study, we present ZSVC, a novel Zero-shot Style Voice Conversion approach that utilizes a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04416v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04416v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04416v1-abstract-full" style="display: none;"> Style voice conversion aims to transform the speaking style of source speech into a desired style while keeping the original speaker's identity. However, previous style voice conversion approaches primarily focus on well-defined domains such as emotional aspects, limiting their practical applications. In this study, we present ZSVC, a novel Zero-shot Style Voice Conversion approach that utilizes a speech codec and a latent diffusion model with speech prompting mechanism to facilitate in-context learning for speaking style conversion. To disentangle speaking style and speaker timbre, we introduce information bottleneck to filter speaking style in the source speech and employ Uncertainty Modeling Adaptive Instance Normalization (UMAdaIN) to perturb the speaker timbre in the style prompt. Moreover, we propose a novel adversarial training strategy to enhance in-context learning and improve style similarity. Experiments conducted on 44,000 hours of speech data demonstrate the superior performance of ZSVC in generating speech with diverse speaking styles in zero-shot scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04416v1-abstract-full').style.display = 'none'; document.getElementById('2501.04416v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages, 3 figures, accepted by ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04336">arXiv:2501.04336</a> <span> [<a href="https://arxiv.org/pdf/2501.04336">pdf</a>, <a href="https://arxiv.org/format/2501.04336">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Building a Mind Palace: Structuring Environment-Grounded Semantic Graphs for Effective Long Video Analysis with LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zeyi Huang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Y">Yuyang Ji</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaofang Wang</a>, <a href="/search/cs?searchtype=author&query=Mehta%2C+N">Nikhil Mehta</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+D">Donghyun Lee</a>, <a href="/search/cs?searchtype=author&query=Vanvalkenburgh%2C+S">Sigmund Vanvalkenburgh</a>, <a href="/search/cs?searchtype=author&query=Zha%2C+S">Shengxin Zha</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+B">Bolin Lai</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+L">Licheng Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+N">Ning Zhang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+Y+J">Yong Jae Lee</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Miao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04336v1-abstract-short" style="display: inline;"> Long-form video understanding with Large Vision Language Models is challenged by the need to analyze temporally dispersed yet spatially concentrated key moments within limited context windows. In this work, we introduce VideoMindPalace, a new framework inspired by the "Mind Palace", which organizes critical video moments into a topologically structured semantic graph. VideoMindPalace organizes key… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04336v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04336v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04336v1-abstract-full" style="display: none;"> Long-form video understanding with Large Vision Language Models is challenged by the need to analyze temporally dispersed yet spatially concentrated key moments within limited context windows. In this work, we introduce VideoMindPalace, a new framework inspired by the "Mind Palace", which organizes critical video moments into a topologically structured semantic graph. VideoMindPalace organizes key information through (i) hand-object tracking and interaction, (ii) clustered activity zones representing specific areas of recurring activities, and (iii) environment layout mapping, allowing natural language parsing by LLMs to provide grounded insights on spatio-temporal and 3D context. 
In addition, we propose the Video MindPalace Benchmark (VMB), to assess human-like reasoning, including spatial localization, temporal reasoning, and layout-aware sequential understanding. Evaluated on VMB and established video QA datasets, including EgoSchema, NExT-QA, IntentQA, and the Active Memories Benchmark, VideoMindPalace demonstrates notable gains in spatio-temporal coherence and human-aligned reasoning, advancing long-form video analysis capabilities in VLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04336v1-abstract-full').style.display = 'none'; document.getElementById('2501.04336v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03764">arXiv:2501.03764</a> <span> [<a href="https://arxiv.org/pdf/2501.03764">pdf</a>, <a href="https://arxiv.org/format/2501.03764">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SelectiveFinetuning: Enhancing Transfer Learning in Sleep Staging through Selective Domain Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Siyuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chenyu Liu</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Yi Ding</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xinliang Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03764v1-abstract-short" style="display: inline;"> In practical sleep stage classification, a key challenge is the variability of EEG data across different subjects and environments. Differences in physiology, age, health status, and recording conditions can lead to domain shifts between data. These domain shifts often result in decreased model accuracy and reliability, particularly when the model is applied to new data with characteristics differ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03764v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03764v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03764v1-abstract-full" style="display: none;"> In practical sleep stage classification, a key challenge is the variability of EEG data across different subjects and environments. Differences in physiology, age, health status, and recording conditions can lead to domain shifts between data. These domain shifts often result in decreased model accuracy and reliability, particularly when the model is applied to new data with characteristics different from those it was originally trained on, which is a typical manifestation of negative transfer. To address this, we propose SelectiveFinetuning in this paper. 
Our method utilizes a pretrained Multi Resolution Convolutional Neural Network (MRCNN) to extract EEG features, capturing the distinctive characteristics of different sleep stages. To mitigate the effect of domain shifts, we introduce a domain aligning mechanism that employs Earth Mover Distance (EMD) to evaluate and select source domain data closely matching the target domain. By finetuning the model with selective source data, our SelectiveFinetuning enhances the model's performance on target domain that exhibits domain shifts compared to the data used for training. Experimental results show that our method outperforms existing baselines, offering greater robustness and adaptability in practical scenarios where data distributions are often unpredictable. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03764v1-abstract-full').style.display = 'none'; document.getElementById('2501.03764v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICASSP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02749">arXiv:2501.02749</a> <span> [<a href="https://arxiv.org/pdf/2501.02749">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Robot Route Optimization in Smart Logistics with Transformer and GNN Integration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+H">Hao Luo</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jianjun Wei</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shuchen Zhao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+A">Ankai Liang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhongjin Xu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+R">Ruxue Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02749v1-abstract-short" style="display: inline;"> This research delves into advanced route optimization for robots in smart logistics, leveraging a fusion of Transformer architectures, Graph Neural Networks (GNNs), and Generative Adversarial Networks (GANs). 
The approach utilizes a graph-based representation encompassing geographical data, cargo allocation, and robot dynamics, addressing both spatial and resource limitations to refine route effic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02749v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02749v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02749v1-abstract-full" style="display: none;"> This research delves into advanced route optimization for robots in smart logistics, leveraging a fusion of Transformer architectures, Graph Neural Networks (GNNs), and Generative Adversarial Networks (GANs). The approach utilizes a graph-based representation encompassing geographical data, cargo allocation, and robot dynamics, addressing both spatial and resource limitations to refine route efficiency. Through extensive testing with authentic logistics datasets, the proposed method achieves notable improvements, including a 15% reduction in travel distance, a 20% boost in time efficiency, and a 10% decrease in energy consumption. These findings highlight the algorithm's effectiveness, promoting enhanced performance in intelligent logistics operations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02749v1-abstract-full').style.display = 'none'; document.getElementById('2501.02749v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">21 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.01821">arXiv:2501.01821</a> <span> [<a href="https://arxiv.org/pdf/2501.01821">pdf</a>, <a href="https://arxiv.org/format/2501.01821">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> SDPO: Segment-Level Direct Preference Optimization for Social Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+A">Aobo Kong</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Wentao Ma</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shiwan Zhao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yongbin Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuchuan Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Ke Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoqian Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qicheng Li</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yong Qin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.01821v1-abstract-short" 
style="display: inline;"> Social agents powered by large language models (LLMs) can simulate human social behaviors but fall short in handling complex goal-oriented social dialogues. Direct Preference Optimization (DPO) has proven effective in aligning LLM behavior with human preferences across a variety of agent tasks. Existing DPO-based approaches for multi-turn interactions are divided into turn-level and session-level… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01821v1-abstract-full').style.display = 'inline'; document.getElementById('2501.01821v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.01821v1-abstract-full" style="display: none;"> Social agents powered by large language models (LLMs) can simulate human social behaviors but fall short in handling complex goal-oriented social dialogues. Direct Preference Optimization (DPO) has proven effective in aligning LLM behavior with human preferences across a variety of agent tasks. Existing DPO-based approaches for multi-turn interactions are divided into turn-level and session-level methods. The turn-level method is overly fine-grained, focusing exclusively on individual turns, while session-level methods are too coarse-grained, often introducing training noise. To address these limitations, we propose Segment-Level Direct Preference Optimization (SDPO), which focuses on specific key segments within interactions to optimize multi-turn agent behavior while minimizing training noise. Evaluations on the SOTOPIA benchmark demonstrate that SDPO-tuned agents consistently outperform both existing DPO-based methods and proprietary LLMs like GPT-4o, underscoring SDPO's potential to advance the social intelligence of LLM-based agents. We release our code and data at https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/SDPO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.01821v1-abstract-full').style.display = 'none'; document.getElementById('2501.01821v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
arXiv:2501.00192 (https://arxiv.org/abs/2501.00192) [cs.CV, cs.CL, cs.CY, cs.LG]
MLLM-as-a-Judge for Image Safety without Human Labeling
Authors: Zhenting Wang, Shuming Hu, Shiyu Zhao, Xiaowen Lin, Felix Juefei-Xu, Zhuowei Li, Ligong Han, Harihar Subramanyam, Li Chen, Jianfa Chen, Nan Jiang, Lingjuan Lyu, Shiqing Ma, Dimitris N. Metaxas, Ankit Jain
Abstract: Image content safety has become a significant challenge with the rise of visual media on online platforms. Meanwhile, in the age of AI-generated content (AIGC), many image generation models are capable of producing harmful content, such as images containing sexual or violent material. Thus, it becomes crucial to identify such unsafe images based on established safety rules. Pre-trained Multimodal Large Language Models (MLLMs) offer potential in this regard, given their strong pattern recognition abilities. Existing approaches typically fine-tune MLLMs with human-labeled datasets, which, however, brings a series of drawbacks. First, relying on human annotators to label data following intricate and detailed guidelines is both expensive and labor-intensive. Furthermore, users of safety judgment systems may need to frequently update safety rules, making fine-tuning on human-based annotation more challenging. This raises the research question: Can we detect unsafe images by querying MLLMs in a zero-shot setting using a predefined safety constitution (a set of safety rules)? Our research showed that simply querying pre-trained MLLMs does not yield satisfactory results. This lack of effectiveness stems from factors such as the subjectivity of safety rules, the complexity of lengthy constitutions, and the inherent biases in the models. To address these challenges, we propose an MLLM-based method that includes objectifying safety rules, assessing the relevance between rules and images, making quick judgments based on debiased token probabilities with logically complete yet simplified precondition chains for safety rules, and conducting more in-depth reasoning with cascaded chain-of-thought processes if necessary. Experimental results demonstrate that our method is highly effective for zero-shot image safety judgment tasks.
Submitted 30 December, 2024; originally announced January 2025.

arXiv:2412.20821 (https://arxiv.org/abs/2412.20821) [eess.AS, cs.CL, cs.SD]
Enhancing Multimodal Emotion Recognition through Multi-Granularity Cross-Modal Alignment
Authors: Xuechen Wang, Shiwan Zhao, Haoqin Sun, Hui Wang, Jiaming Zhou, Yong Qin
Abstract: Multimodal emotion recognition (MER), leveraging speech and text, has emerged as a pivotal domain within human-computer interaction, demanding sophisticated methods for effective multimodal integration. The challenge of aligning features across these modalities is significant, with most existing approaches adopting a singular alignment strategy. Such a narrow focus not only limits model performance but also fails to address the complexity and ambiguity inherent in emotional expressions. In response, this paper introduces a Multi-Granularity Cross-Modal Alignment (MGCMA) framework, distinguished by its comprehensive approach encompassing distribution-based, instance-based, and token-based alignment modules. This framework enables a multi-level perception of emotional information across modalities. Our experiments on IEMOCAP demonstrate that our proposed method outperforms current state-of-the-art techniques.
Submitted 30 December, 2024; originally announced December 2024.
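To make the instance-based granularity in the MGCMA entry above concrete, here is a minimal sketch of a symmetric InfoNCE-style contrastive loss that pulls paired speech and text embeddings together. It is a generic stand-in rather than the MGCMA modules themselves; the temperature and the assumption of pre-computed embeddings are illustrative choices.

```python
import torch
import torch.nn.functional as F

def instance_alignment_loss(speech_emb: torch.Tensor, text_emb: torch.Tensor,
                            temperature: float = 0.07) -> torch.Tensor:
    """Symmetric InfoNCE over a batch of paired speech/text embeddings.

    speech_emb, text_emb: (batch, dim); row i of each tensor comes from the
    same utterance, so the diagonal entries are the positive pairs.
    """
    speech = F.normalize(speech_emb, dim=-1)
    text = F.normalize(text_emb, dim=-1)
    logits = speech @ text.t() / temperature          # scaled cosine similarities
    targets = torch.arange(speech.size(0), device=speech.device)
    # align speech-to-text and text-to-speech directions
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))
```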
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20484">arXiv:2412.20484</a> <span> [<a href="https://arxiv.org/pdf/2412.20484">pdf</a>, <a href="https://arxiv.org/format/2412.20484">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Networking and Internet Architecture">cs.NI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Exploiting NOMA Transmissions in Multi-UAV-assisted Wireless Networks: From Aerial-RIS to Mode-switching UAVs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Songhan Zhao</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+S">Shimin Gong</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+B">Bo Gu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lanhua Li</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+B">Bin Lyu</a>, <a href="/search/cs?searchtype=author&query=Hoang%2C+D+T">Dinh Thai Hoang</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+C">Changyan Yi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20484v1-abstract-short" style="display: inline;"> In this paper, we consider an aerial reconfigurable intelligent surface (ARIS)-assisted wireless network, where multiple unmanned aerial vehicles (UAVs) collect data from ground users (GUs) by using the non-orthogonal multiple access (NOMA) method. The ARIS provides enhanced channel controllability to improve the NOMA transmissions and reduce the co-channel interference among UAVs. We also propose… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20484v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20484v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20484v1-abstract-full" style="display: none;"> In this paper, we consider an aerial reconfigurable intelligent surface (ARIS)-assisted wireless network, where multiple unmanned aerial vehicles (UAVs) collect data from ground users (GUs) by using the non-orthogonal multiple access (NOMA) method. The ARIS provides enhanced channel controllability to improve the NOMA transmissions and reduce the co-channel interference among UAVs. We also propose a novel dual-mode switching scheme, where each UAV equipped with both an ARIS and a radio frequency (RF) transceiver can adaptively perform passive reflection or active transmission. We aim to maximize the overall network throughput by jointly optimizing the UAVs' trajectory planning and operating modes, the ARIS's passive beamforming, and the GUs' transmission control strategies. We propose an optimization-driven hierarchical deep reinforcement learning (O-HDRL) method to decompose it into a series of subproblems. 
Specifically, the multi-agent deep deterministic policy gradient (MADDPG) adjusts the UAVs' trajectory planning and mode switching strategies, while the passive beamforming and transmission control strategies are tackled by the optimization methods. Numerical results reveal that the O-HDRL efficiently improves the learning stability and reward performance compared to the benchmark methods. Meanwhile, the dual-mode switching scheme is verified to achieve a higher throughput performance compared to the fixed ARIS scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20484v1-abstract-full').style.display = 'none'; document.getElementById('2412.20484v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19417">arXiv:2412.19417</a> <span> [<a href="https://arxiv.org/pdf/2412.19417">pdf</a>, <a href="https://arxiv.org/format/2412.19417">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> KALAHash: Knowledge-Anchored Low-Resource Adaptation for Deep Hashing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shu Zhao</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tan Yu</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+X">Xiaoshuai Hao</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Wenchao Ma</a>, <a href="/search/cs?searchtype=author&query=Narayanan%2C+V">Vijaykrishnan Narayanan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19417v1-abstract-short" style="display: inline;"> Deep hashing has been widely used for large-scale approximate nearest neighbor search due to its storage and search efficiency. However, existing deep hashing methods predominantly rely on abundant training data, leaving the more challenging scenario of low-resource adaptation for deep hashing relatively underexplored. This setting involves adapting pre-trained models to downstream tasks with only… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19417v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19417v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19417v1-abstract-full" style="display: none;"> Deep hashing has been widely used for large-scale approximate nearest neighbor search due to its storage and search efficiency. However, existing deep hashing methods predominantly rely on abundant training data, leaving the more challenging scenario of low-resource adaptation for deep hashing relatively underexplored. This setting involves adapting pre-trained models to downstream tasks with only an extremely small number of training samples available. 
Our preliminary benchmarks reveal that current methods suffer significant performance degradation due to the distribution shift caused by limited training samples. To address these challenges, we introduce Class-Calibration LoRA (CLoRA), a novel plug-and-play approach that dynamically constructs low-rank adaptation matrices by leveraging class-level textual knowledge embeddings. CLoRA effectively incorporates prior class knowledge as anchors, enabling parameter-efficient fine-tuning while maintaining the original data distribution. Furthermore, we propose Knowledge-Guided Discrete Optimization (KIDDO), a framework to utilize class knowledge to compensate for the scarcity of visual information and enhance the discriminability of hash codes. Extensive experiments demonstrate that our proposed method, Knowledge- Anchored Low-Resource Adaptation Hashing (KALAHash), significantly boosts retrieval performance and achieves a 4x data efficiency in low-resource scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19417v1-abstract-full').style.display = 'none'; document.getElementById('2412.19417v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18928">arXiv:2412.18928</a> <span> [<a href="https://arxiv.org/pdf/2412.18928">pdf</a>, <a href="https://arxiv.org/format/2412.18928">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> UNIC-Adapter: Unified Image-instruction Adapter with Multi-modal Transformer for Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Duan%2C+L">Lunhao Duan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shanshan Zhao</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+W">Wenjun Yan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yinglun Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qing-Guo Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhao Xu</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Weihua Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaifu Zhang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+M">Mingming Gong</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+G">Gui-Song Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18928v1-abstract-short" style="display: inline;"> Recently, text-to-image generation models have achieved remarkable advancements, particularly with diffusion models facilitating high-quality image synthesis from textual descriptions. 
However, these models often struggle with achieving precise control over pixel-level layouts, object appearances, and global styles when using text prompts alone. To mitigate this issue, previous works introduce con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18928v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18928v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18928v1-abstract-full" style="display: none;"> Recently, text-to-image generation models have achieved remarkable advancements, particularly with diffusion models facilitating high-quality image synthesis from textual descriptions. However, these models often struggle with achieving precise control over pixel-level layouts, object appearances, and global styles when using text prompts alone. To mitigate this issue, previous works introduce conditional images as auxiliary inputs for image generation, enhancing control but typically necessitating specialized models tailored to different types of reference inputs. In this paper, we explore a new approach to unify controllable generation within a single framework. Specifically, we propose the unified image-instruction adapter (UNIC-Adapter) built on the Multi-Modal-Diffusion Transformer architecture, to enable flexible and controllable generation across diverse conditions without the need for multiple specialized models. Our UNIC-Adapter effectively extracts multi-modal instruction information by incorporating both conditional images and task instructions, injecting this information into the image generation process through a cross-attention mechanism enhanced by Rotary Position Embedding. Experimental results across a variety of tasks, including pixel-level spatial control, subject-driven image generation, and style-image-based image synthesis, demonstrate the effectiveness of our UNIC-Adapter in unified controllable image generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18928v1-abstract-full').style.display = 'none'; document.getElementById('2412.18928v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
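As a rough illustration of injecting instruction features into a generation backbone via cross-attention, the sketch below wraps torch.nn.MultiheadAttention in a residual block where image tokens attend to instruction tokens. It deliberately omits the Rotary Position Embedding enhancement and the Multi-Modal-Diffusion Transformer integration described in the UNIC-Adapter entry above; the class name, dimensions, and interface are assumptions, not the paper's implementation.

```python
import torch
import torch.nn as nn

class CrossAttentionInjector(nn.Module):
    """Minimal cross-attention block: image tokens attend to instruction tokens.

    A generic stand-in for the kind of adapter described above; RoPE and the
    diffusion-transformer specifics are intentionally left out of this sketch.
    """
    def __init__(self, dim: int = 512, num_heads: int = 8):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)

    def forward(self, image_tokens: torch.Tensor, instruction_tokens: torch.Tensor) -> torch.Tensor:
        # image_tokens: (batch, n_img, dim); instruction_tokens: (batch, n_instr, dim)
        attended, _ = self.attn(self.norm(image_tokens), instruction_tokens, instruction_tokens)
        return image_tokens + attended  # residual injection of instruction information
```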
arXiv:2412.18547 (https://arxiv.org/abs/2412.18547) [cs.CL, cs.AI, cs.LG]
Token-Budget-Aware LLM Reasoning
Authors: Tingxu Han, Zhenting Wang, Chunrong Fang, Shiyu Zhao, Shiqing Ma, Zhenyu Chen
Abstract: Reasoning is critical for large language models (LLMs) to excel in a wide range of tasks. While methods like Chain-of-Thought (CoT) reasoning enhance LLM performance by decomposing problems into intermediate steps, they also incur significant overhead in token usage, leading to increased costs. We find that the reasoning process of current LLMs is unnecessarily lengthy and it can be compressed by including a reasonable token budget in the prompt, but the choice of token budget plays a crucial role in the actual compression effectiveness. We then propose a token-budget-aware LLM reasoning framework, which dynamically estimates token budgets for different problems based on reasoning complexity and uses the estimated token budgets to guide the reasoning process. Experiments show that our method effectively reduces token costs in CoT reasoning with only a slight performance reduction, offering a practical solution to balance efficiency and accuracy in LLM reasoning. Code: https://github.com/GeniusHTX/TALE.
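The budget-in-prompt idea above is easy to picture with a small sketch: a prompt template that asks the model to keep its reasoning within an estimated token budget. The wording and the fact that the budget is simply passed in by the caller are illustrative assumptions; the actual budget estimation strategy is in the linked TALE repository, not reproduced here.

```python
def token_budget_prompt(question: str, estimated_budget: int) -> str:
    """Wrap a question with an explicit token budget for the reasoning trace.

    `estimated_budget` would come from a complexity estimator in a real system;
    here it is a plain argument supplied by the caller.
    """
    return (
        f"{question}\n"
        f"Let's think step by step and use less than {estimated_budget} tokens "
        f"for the reasoning, then give the final answer."
    )

# Example: a short budget for an easy arithmetic question.
print(token_budget_prompt("What is 17 * 24?", estimated_budget=50))
```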
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18547v3-abstract-full').style.display = 'none'; document.getElementById('2412.18547v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18419">arXiv:2412.18419</a> <span> [<a href="https://arxiv.org/pdf/2412.18419">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Research on the Proximity Relationships of Psychosomatic Disease Knowledge Graph Modules Extracted by Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Z">Zihan Zhou</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Ziyi Zeng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Wenhao Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yihui Zhu</a>, <a href="/search/cs?searchtype=author&query=Mao%2C+J">Jiaxin Mao</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yonggui Yuan</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+M">Min Xia</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shubin Zhao</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+M">Mengyu Yao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yunqian Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18419v1-abstract-short" style="display: inline;"> As social changes accelerate, the incidence of psychosomatic disorders has significantly increased, becoming a major challenge in global health issues. This necessitates an innovative knowledge system and analytical methods to aid in diagnosis and treatment. Here, we establish the ontology model and entity types, using the BERT model and LoRA-tuned LLM for named entity recognition, constructing th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18419v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18419v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18419v1-abstract-full" style="display: none;"> As social changes accelerate, the incidence of psychosomatic disorders has significantly increased, becoming a major challenge in global health issues. This necessitates an innovative knowledge system and analytical methods to aid in diagnosis and treatment. Here, we establish the ontology model and entity types, using the BERT model and LoRA-tuned LLM for named entity recognition, constructing the knowledge graph with 9668 triples. 
Next, by analyzing the network distances between disease, symptom, and drug modules, it was found that closer network distances among diseases can predict greater similarities in their clinical manifestations, treatment approaches, and psychological mechanisms, and closer distances between symptoms indicate that they are more likely to co-occur. Lastly, by comparing the proximity d and proximity z score, it was shown that symptom-disease pairs in primary diagnostic relationships have a stronger association and are of higher referential value than those in diagnostic relationships. The research results revealed the potential connections between diseases, co-occurring symptoms, and similarities in treatment strategies, providing new perspectives for the diagnosis and treatment of psychosomatic disorders and valuable information for future mental health research and practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18419v1-abstract-full').style.display = 'none'; document.getElementById('2412.18419v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18255">arXiv:2412.18255</a> <span> [<a href="https://arxiv.org/pdf/2412.18255">pdf</a>, <a href="https://arxiv.org/format/2412.18255">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AdaCo: Overcoming Visual Foundation Model Noise in 3D Semantic Segmentation via Adaptive Label Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zou%2C+P">Pufan Zou</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shijia Zhao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Weijie Huang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Q">Qiming Xia</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+C">Chenglu Wen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Cheng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18255v1-abstract-short" style="display: inline;"> Recently, Visual Foundation Models (VFMs) have shown a remarkable generalization performance in 3D perception tasks. However, their effectiveness in large-scale outdoor datasets remains constrained by the scarcity of accurate supervision signals, the extensive noise caused by variable outdoor conditions, and the abundance of unknown objects. 
In this work, we propose a novel label-free learning met… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18255v1-abstract-full').style.display = 'inline'; document.getElementById('2412.18255v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18255v1-abstract-full" style="display: none;"> Recently, Visual Foundation Models (VFMs) have shown a remarkable generalization performance in 3D perception tasks. However, their effectiveness in large-scale outdoor datasets remains constrained by the scarcity of accurate supervision signals, the extensive noise caused by variable outdoor conditions, and the abundance of unknown objects. In this work, we propose a novel label-free learning method, Adaptive Label Correction (AdaCo), for 3D semantic segmentation. AdaCo first introduces the Cross-modal Label Generation Module (CLGM), providing cross-modal supervision with the formidable interpretive capabilities of the VFMs. Subsequently, AdaCo incorporates the Adaptive Noise Corrector (ANC), updating and adjusting the noisy samples within this supervision iteratively during training. Moreover, we develop an Adaptive Robust Loss (ARL) function to modulate each sample's sensitivity to noisy supervision, preventing potential underfitting issues associated with robust loss. Our proposed AdaCo can effectively mitigate the performance limitations of label-free learning networks in 3D semantic segmentation tasks. Extensive experiments on two outdoor benchmark datasets highlight the superior performance of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18255v1-abstract-full').style.display = 'none'; document.getElementById('2412.18255v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2025 AAAI</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhao%2C+S&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>