Search | arXiv e-print repository

Showing 1–50 of 4,397 results for author: Liu, Z

Searching in archive cs. Results are sorted by announcement date (newest first), 50 results per page.
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Liu%2C+Z&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Liu%2C+Z&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Liu%2C+Z&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Liu%2C+Z&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Liu%2C+Z&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Liu%2C+Z&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18624">arXiv:2411.18624</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18624">pdf</a>, <a href="https://arxiv.org/format/2411.18624">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GeneMAN: Generalizable Single-Image 3D Human Reconstruction from Multi-Source Human Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W">Wentao Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+H">Hang Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Hong%2C+F">Fangzhou Hong</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xue Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianfu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yizhou Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Ziwei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+L">Liang Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18624v1-abstract-short" style="display: inline;"> Given a single in-the-wild human photo, it remains a challenging task to reconstruct a high-fidelity 3D human model. 
Existing methods face difficulties including a) the varying body proportions captured by in-the-wild human images; b) diverse personal belongings within the shot; and c) ambiguities in human postures and inconsistency in human textures. In addition, the scarcity of high-quality huma&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18624v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18624v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18624v1-abstract-full" style="display: none;"> Given a single in-the-wild human photo, it remains a challenging task to reconstruct a high-fidelity 3D human model. Existing methods face difficulties including a) the varying body proportions captured by in-the-wild human images; b) diverse personal belongings within the shot; and c) ambiguities in human postures and inconsistency in human textures. In addition, the scarcity of high-quality human data intensifies the challenge. To address these problems, we propose a Generalizable image-to-3D huMAN reconstruction framework, dubbed GeneMAN, building upon a comprehensive multi-source collection of high-quality human data, including 3D scans, multi-view videos, single photos, and our generated synthetic human data. GeneMAN encompasses three key modules. 1) Without relying on parametric human models (e.g., SMPL), GeneMAN first trains a human-specific text-to-image diffusion model and a view-conditioned diffusion model, serving as GeneMAN 2D human prior and 3D human prior for reconstruction, respectively. 2) With the help of the pretrained human prior models, the Geometry Initialization-&amp;-Sculpting pipeline is leveraged to recover high-quality 3D human geometry given a single image. 3) To achieve high-fidelity 3D human textures, GeneMAN employs the Multi-Space Texture Refinement pipeline, consecutively refining textures in the latent and the pixel spaces. Extensive experimental results demonstrate that GeneMAN could generate high-quality 3D human models from a single image input, outperforming prior state-of-the-art methods. Notably, GeneMAN could reveal much better generalizability in dealing with in-the-wild images, often yielding high-quality 3D human models in natural poses with common items, regardless of the body proportions in the input images. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18624v1-abstract-full').style.display = 'none'; document.getElementById('2411.18624v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://roooooz.github.io/GeneMAN/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18612">arXiv:2411.18612</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18612">pdf</a>, <a href="https://arxiv.org/format/2411.18612">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Robust Offline Reinforcement Learning with Linearly Structured $f$-Divergence Regularization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+C">Cheng Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhishuai Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+P">Pan Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18612v1-abstract-short" style="display: inline;"> The Distributionally Robust Markov Decision Process (DRMDP) is a popular framework for addressing dynamics shift in reinforcement learning by learning policies robust to the worst-case transition dynamics within a constrained set. However, solving its dual optimization oracle poses significant challenges, limiting theoretical analysis and computational efficiency. The recently proposed Robust Regu&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18612v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18612v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18612v1-abstract-full" style="display: none;"> The Distributionally Robust Markov Decision Process (DRMDP) is a popular framework for addressing dynamics shift in reinforcement learning by learning policies robust to the worst-case transition dynamics within a constrained set. However, solving its dual optimization oracle poses significant challenges, limiting theoretical analysis and computational efficiency. The recently proposed Robust Regularized Markov Decision Process (RRMDP) replaces the uncertainty set constraint with a regularization term on the value function, offering improved scalability and theoretical insights. Yet, existing RRMDP methods rely on unstructured regularization, often leading to overly conservative policies by considering transitions that are unrealistic. To address these issues, we propose a novel framework, the $d$-rectangular linear robust regularized Markov decision process ($d$-RRMDP), which introduces a linear latent structure into both transition kernels and regularization. 
For the offline RL setting, where an agent learns robust policies from a pre-collected dataset in the nominal environment, we develop a family of algorithms, Robust Regularized Pessimistic Value Iteration (R2PVI), employing linear function approximation and $f$-divergence based regularization terms on transition kernels. We provide instance-dependent upper bounds on the suboptimality gap of R2PVI policies, showing these bounds depend on how well the dataset covers state-action spaces visited by the optimal robust policy under robustly admissible transitions. This term is further shown to be fundamental to $d$-RRMDPs via information-theoretic lower bounds. Finally, numerical experiments validate that R2PVI learns robust policies and is computationally more efficient than methods for constrained DRMDPs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18612v1-abstract-full').style.display = 'none'; document.getElementById('2411.18612v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">52 pages, 3 figures, 2 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18343">arXiv:2411.18343</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.18343">pdf</a>, <a href="https://arxiv.org/format/2411.18343">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FreqX: What neural networks learn is what network designers say </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zechen Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18343v1-abstract-short" style="display: inline;"> Personalized Federal learning(PFL) allows clients to cooperatively train a personalized model without disclosing their private dataset. However, PFL suffers from Non-IID, heterogeneous devices, lack of fairness, and unclear contribution which urgently need the interpretability of deep learning model to overcome these challenges. These challenges proposed new demands for interpretability. Low cost,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18343v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18343v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18343v1-abstract-full" style="display: none;"> Personalized Federal learning(PFL) allows clients to cooperatively train a personalized model without disclosing their private dataset. 
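As a rough illustration of the pessimistic value iteration backbone that R2PVI builds on, the sketch below runs standard linear pessimistic value iteration (PEVI-style) on an offline dataset. It is not the authors' algorithm: the $f$-divergence regularization on the transition kernels is omitted, and every function and variable name here is a placeholder.

```python
# Minimal sketch of pessimistic value iteration with linear function
# approximation; the regularization terms of R2PVI itself are omitted.
import numpy as np

def pevi(dataset, phi, actions, H, d, beta, lam=1.0):
    """dataset[h]: list of offline tuples (s, a, r, s_next) for step h;
    phi(s, a): d-dimensional feature vector; actions: finite action set;
    beta: pessimism penalty scale; lam: ridge parameter."""
    V_next = lambda s: 0.0                        # V_{H+1}(s) = 0
    Q_funcs = [None] * H
    for h in reversed(range(H)):
        Lambda = lam * np.eye(d)
        target = np.zeros(d)
        for s, a, r, s_next in dataset[h]:
            f = phi(s, a)
            Lambda += np.outer(f, f)
            target += f * (r + V_next(s_next))
        Lambda_inv = np.linalg.inv(Lambda)
        w = Lambda_inv @ target                   # ridge-regression estimate

        def Q(s, a, w=w, Li=Lambda_inv):
            f = phi(s, a)
            penalty = beta * np.sqrt(f @ Li @ f)  # uncertainty-based pessimism
            return float(np.clip(f @ w - penalty, 0.0, H))

        Q_funcs[h] = Q
        V_next = lambda s, Q=Q: max(Q(s, a) for a in actions)
    # greedy policy with respect to the pessimistic Q estimates
    return [lambda s, Q=Q_funcs[h]: max(actions, key=lambda a: Q(s, a))
            for h in range(H)]
```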
3. arXiv:2411.18343 [pdf, other] (cs.LG, cs.AI)
FreqX: What neural networks learn is what network designers say
Authors: Zechen Liu
Abstract: Personalized Federated Learning (PFL) allows clients to cooperatively train a personalized model without disclosing their private datasets. However, PFL suffers from non-IID data, heterogeneous devices, lack of fairness, and unclear contribution, challenges that urgently call for interpretability of the deep learning model. These challenges pose new demands on interpretability: low cost, privacy, and detailed information. No current interpretability method satisfies all of them. In this paper, we propose a novel interpretability method, FreqX, by introducing signal processing and information theory. Our experiments show that the explanation results of FreqX contain both attribution information and concept information. FreqX runs at least 10 times faster than the baselines that contain concept information.
Submitted 27 November, 2024; originally announced November 2024.
Comments: 16 pages, 9 figures

4. arXiv:2411.18308 [pdf, other] (cs.AR, cs.PF)
CXL-Interference: Analysis and Characterization in Modern Computer Systems
Authors: Shunyu Mao, Jiajun Luo, Yixin Li, Jiapeng Zhou, Weidong Zhang, Zheng Liu, Teng Ma, Shuwen Deng
Abstract: Compute Express Link (CXL) is a promising technology that addresses memory and storage challenges. Despite its advantages, CXL faces performance threats from external interference when co-existing with current memory and storage systems. This interference is under-explored in existing research. To address this, we develop CXL-Interplay, systematically characterizing and analyzing interference from memory and storage systems. To the best of our knowledge, we are the first to characterize CXL interference on real CXL hardware. We also provide reverse-reasoning analysis with performance counters and kernel functions. In the end, we propose and evaluate mitigating solutions.
Submitted 27 November, 2024; originally announced November 2024.
5. arXiv:2411.18009 [pdf, other] (cs.RO, cs.CV)
Monocular Obstacle Avoidance Based on Inverse PPO for Fixed-wing UAVs
Authors: Haochen Chai, Meimei Su, Yang Lyu, Zhunga Liu, Chunhui Zhao, Quan Pan
Abstract: Fixed-wing Unmanned Aerial Vehicles (UAVs) are one of the most commonly used platforms for the burgeoning Low-altitude Economy (LAE) and Urban Air Mobility (UAM), due to their long endurance and high-speed capabilities. Classical obstacle avoidance systems, which rely on prior maps or sophisticated sensors, face limitations in unknown low-altitude environments and small UAV platforms. In response, this paper proposes a lightweight deep reinforcement learning (DRL) based UAV collision avoidance system that enables a fixed-wing UAV to avoid unknown obstacles at cruise speed over 30 m/s, with only onboard visual sensors. The proposed system employs a single-frame image depth inference module with a streamlined network architecture to ensure real-time obstacle detection, optimized for edge computing devices. After that, a reinforcement learning controller with a novel reward function is designed to balance the target approach and flight trajectory smoothness, satisfying the specific dynamic constraints and stability requirements of a fixed-wing UAV platform. An adaptive entropy adjustment mechanism is introduced to mitigate the exploration-exploitation trade-off inherent in DRL, improving training convergence and obstacle avoidance success rates. Extensive software-in-the-loop and hardware-in-the-loop experiments demonstrate that the proposed framework outperforms other methods in obstacle avoidance efficiency and flight trajectory smoothness and confirm the feasibility of implementing the algorithm on edge devices. The source code is publicly available at https://github.com/ch9397/FixedWing-MonoPPO.
Submitted 26 November, 2024; originally announced November 2024.
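The adaptive entropy adjustment mentioned in the abstract can be pictured as an entropy coefficient that is tuned toward a target entropy alongside a clipped PPO objective. The sketch below is a generic construction of that idea, not the paper's implementation; the update rule and every name in it are assumptions.

```python
# Generic clipped PPO objective with an adapted entropy coefficient.
import torch

def ppo_loss(ratio, advantage, entropy, log_alpha, target_entropy,
             clip_eps=0.2):
    """ratio: pi_new(a|s)/pi_old(a|s); advantage: estimated advantages;
    entropy: per-sample policy entropy; log_alpha: learnable log entropy
    coefficient; target_entropy: desired average policy entropy."""
    clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantage
    policy_loss = -torch.min(ratio * advantage, clipped).mean()

    alpha = log_alpha.exp().detach()
    loss = policy_loss - alpha * entropy.mean()       # entropy-regularized PPO

    # Raise alpha when entropy drops below the target (more exploration),
    # lower it when the policy is already exploratory enough.
    alpha_loss = log_alpha.exp() * (entropy.mean().detach() - target_entropy)
    return loss, alpha_loss
```

In such a setup, `loss` and `alpha_loss` would typically be minimized with separate optimizers for the policy parameters and `log_alpha`.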
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17837">arXiv:2411.17837</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17837">pdf</a>, <a href="https://arxiv.org/format/2411.17837">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OracleSage: Towards Unified Visual-Linguistic Understanding of Oracle Bone Scripts through Cross-Modal Knowledge Fusion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">Hanqi Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+Y">Yi Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Junhao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhengliang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+Y">Yifan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Shu%2C+P">Peng Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yiwei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+H">Huaqin Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Mihm%2C+S">Stephen Mihm</a>, <a href="/search/cs?searchtype=author&amp;query=Howe%2C+L+C">Lewis C Howe</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tianming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17837v1-abstract-short" style="display: inline;"> Oracle bone script (OBS), as China&#39;s earliest mature writing system, present significant challenges in automatic recognition due to their complex pictographic structures and divergence from modern Chinese characters. We introduce OracleSage, a novel cross-modal framework that integrates hierarchical visual understanding with graph-based semantic reasoning. Specifically, we propose (1) a Hierarchic&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17837v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17837v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17837v1-abstract-full" style="display: none;"> Oracle bone script (OBS), as China&#39;s earliest mature writing system, present significant challenges in automatic recognition due to their complex pictographic structures and divergence from modern Chinese characters. We introduce OracleSage, a novel cross-modal framework that integrates hierarchical visual understanding with graph-based semantic reasoning. Specifically, we propose (1) a Hierarchical Visual-Semantic Understanding module that enables multi-granularity feature extraction through progressive fine-tuning of LLaVA&#39;s visual backbone, (2) a Graph-based Semantic Reasoning Framework that captures relationships between visual components and semantic concepts through dynamic message passing, and (3) OracleSem, a semantically enriched OBS dataset with comprehensive pictographic and semantic annotations. Experimental results demonstrate that OracleSage significantly outperforms state-of-the-art vision-language models. 
This research establishes a new paradigm for ancient text interpretation while providing valuable technical support for archaeological studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17837v1-abstract-full').style.display = 'none'; document.getElementById('2411.17837v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17713">arXiv:2411.17713</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17713">pdf</a>, <a href="https://arxiv.org/format/2411.17713">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Llama Guard 3-1B-INT4: Compact and Efficient Safeguard for Human-AI Conversations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Fedorov%2C+I">Igor Fedorov</a>, <a href="/search/cs?searchtype=author&amp;query=Plawiak%2C+K">Kate Plawiak</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+L">Lemeng Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Elgamal%2C+T">Tarek Elgamal</a>, <a href="/search/cs?searchtype=author&amp;query=Suda%2C+N">Naveen Suda</a>, <a href="/search/cs?searchtype=author&amp;query=Smith%2C+E">Eric Smith</a>, <a href="/search/cs?searchtype=author&amp;query=Zhan%2C+H">Hongyuan Zhan</a>, <a href="/search/cs?searchtype=author&amp;query=Chi%2C+J">Jianfeng Chi</a>, <a href="/search/cs?searchtype=author&amp;query=Hulovatyy%2C+Y">Yuriy Hulovatyy</a>, <a href="/search/cs?searchtype=author&amp;query=Patel%2C+K">Kimish Patel</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zechun Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+C">Changsheng Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Shi%2C+Y">Yangyang Shi</a>, <a href="/search/cs?searchtype=author&amp;query=Blankevoort%2C+T">Tijmen Blankevoort</a>, <a href="/search/cs?searchtype=author&amp;query=Pasupuleti%2C+M">Mahesh Pasupuleti</a>, <a href="/search/cs?searchtype=author&amp;query=Soran%2C+B">Bilge Soran</a>, <a href="/search/cs?searchtype=author&amp;query=Coudert%2C+Z+D">Zacharie Delpierre Coudert</a>, <a href="/search/cs?searchtype=author&amp;query=Alao%2C+R">Rachad Alao</a>, <a href="/search/cs?searchtype=author&amp;query=Krishnamoorthi%2C+R">Raghuraman Krishnamoorthi</a>, <a href="/search/cs?searchtype=author&amp;query=Chandra%2C+V">Vikas Chandra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17713v1-abstract-short" style="display: inline;"> This paper presents Llama Guard 3-1B-INT4, a compact and efficient Llama Guard model, which has been open-sourced to the community during Meta Connect 2024. 
7. arXiv:2411.17713 [pdf, other] (cs.DC, cs.AI)
Llama Guard 3-1B-INT4: Compact and Efficient Safeguard for Human-AI Conversations
Authors: Igor Fedorov, Kate Plawiak, Lemeng Wu, Tarek Elgamal, Naveen Suda, Eric Smith, Hongyuan Zhan, Jianfeng Chi, Yuriy Hulovatyy, Kimish Patel, Zechun Liu, Changsheng Zhao, Yangyang Shi, Tijmen Blankevoort, Mahesh Pasupuleti, Bilge Soran, Zacharie Delpierre Coudert, Rachad Alao, Raghuraman Krishnamoorthi, Vikas Chandra
Abstract: This paper presents Llama Guard 3-1B-INT4, a compact and efficient Llama Guard model, which has been open-sourced to the community during Meta Connect 2024. We demonstrate that Llama Guard 3-1B-INT4 can be deployed on resource-constrained devices, achieving a throughput of at least 30 tokens per second and a time-to-first-token of 2.5 seconds or less on a commodity Android mobile CPU. Notably, our experiments show that Llama Guard 3-1B-INT4 attains comparable or superior safety moderation scores to its larger counterpart, Llama Guard 3-1B, despite being approximately 7 times smaller in size (440MB).
Submitted 18 November, 2024; originally announced November 2024.
8. arXiv:2411.17404 [pdf, other] (cs.AI, cs.CL)
BPP-Search: Enhancing Tree of Thought Reasoning for Mathematical Modeling Problem Solving
Authors: Teng Wang, Wing-Yin Yu, Zhenqi He, Zehua Liu, Xiongwei Han, Hailei Gong, Han Wu, Wei Shi, Ruifeng She, Fangzhou Zhu, Tao Zhong
Abstract: LLMs exhibit advanced reasoning capabilities, offering the potential to transform natural language questions into mathematical models. However, existing open-source operations research datasets lack detailed annotations of the modeling process, such as variable definitions, focusing solely on objective values, which hinders reinforcement learning applications. To address this, we release the StructuredOR dataset, annotated with comprehensive labels that capture the complete mathematical modeling process. We further propose BPP-Search, an algorithm that integrates reinforcement learning into a tree-of-thought structure using Beam search, a Process reward model, and a pairwise Preference algorithm. This approach enables efficient exploration of tree structures, avoiding exhaustive search while improving accuracy. Extensive experiments on StructuredOR, NL4OPT, and MAMO-ComplexLP datasets show that BPP-Search significantly outperforms state-of-the-art methods, including Chain-of-Thought, Self-Consistency, and Tree-of-Thought. In tree-based reasoning, BPP-Search also surpasses Process Reward Model combined with Greedy or Beam Search, demonstrating superior accuracy and efficiency, and enabling faster retrieval of correct solutions.
Submitted 26 November, 2024; originally announced November 2024.
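The beam-search-plus-process-reward-model core that BPP-Search extends can be sketched generically as follows. Here `generate_steps`, `prm_score`, and `is_complete` are hypothetical stand-ins for an LLM step generator, a process reward model, and a termination check; the paper's pairwise preference component is omitted.

```python
# Beam search over partial reasoning traces scored by a process reward model.
from typing import Callable, List, Tuple

def prm_beam_search(question: str,
                    generate_steps: Callable[[str, List[str]], List[str]],
                    prm_score: Callable[[str, List[str]], float],
                    is_complete: Callable[[List[str]], bool],
                    beam_width: int = 4,
                    max_depth: int = 8) -> List[str]:
    beams: List[Tuple[float, List[str]]] = [(0.0, [])]   # (score, partial trace)
    for _ in range(max_depth):
        candidates = []
        for score, trace in beams:
            if is_complete(trace):
                candidates.append((score, trace))         # keep finished traces
                continue
            for step in generate_steps(question, trace):
                new_trace = trace + [step]
                candidates.append((prm_score(question, new_trace), new_trace))
        # retain only the top-scoring partial solutions
        beams = sorted(candidates, key=lambda c: c[0], reverse=True)[:beam_width]
        if all(is_complete(t) for _, t in beams):
            break
    return max(beams, key=lambda c: c[0])[1]
```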
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17188">arXiv:2411.17188</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17188">pdf</a>, <a href="https://arxiv.org/format/2411.17188">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Interleaved Scene Graph for Interleaved Text-and-Image Generation Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+D">Dongping Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+R">Ruoxi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Pu%2C+S">Shu Pu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhaoyi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yanru Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+C">Caixi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+B">Benlin Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yue Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+Y">Yao Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+P">Pan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Krishna%2C+R">Ranjay Krishna</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17188v1-abstract-short" style="display: inline;"> Many real-world user queries (e.g. &#34;How do to make egg fried rice?&#34;) could benefit from systems capable of generating responses with both textual steps with accompanying images, similar to a cookbook. Models designed to generate interleaved text and images face challenges in ensuring consistency within and across these modalities. To address these challenges, we present ISG, a comprehensive evalua&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17188v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17188v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17188v1-abstract-full" style="display: none;"> Many real-world user queries (e.g. &#34;How do to make egg fried rice?&#34;) could benefit from systems capable of generating responses with both textual steps with accompanying images, similar to a cookbook. Models designed to generate interleaved text and images face challenges in ensuring consistency within and across these modalities. To address these challenges, we present ISG, a comprehensive evaluation framework for interleaved text-and-image generation. ISG leverages a scene graph structure to capture relationships between text and image blocks, evaluating responses on four levels of granularity: holistic, structural, block-level, and image-specific. This multi-tiered evaluation allows for a nuanced assessment of consistency, coherence, and accuracy, and provides interpretable question-answer feedback. In conjunction with ISG, we introduce a benchmark, ISG-Bench, encompassing 1,150 samples across 8 categories and 21 subcategories. 
This benchmark dataset includes complex language-vision dependencies and golden answers to evaluate models effectively on vision-centric tasks such as style transfer, a challenging area for current models. Using ISG-Bench, we demonstrate that recent unified vision-language models perform poorly on generating interleaved content. While compositional approaches that combine separate language and image models show a 111% improvement over unified models at the holistic level, their performance remains suboptimal at both block and image levels. To facilitate future work, we develop ISG-Agent, a baseline agent employing a &#34;plan-execute-refine&#34; pipeline to invoke tools, achieving a 122% performance improvement. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17188v1-abstract-full').style.display = 'none'; document.getElementById('2411.17188v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17137">arXiv:2411.17137</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17137">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Self-reconfiguration Strategies for Space-distributed Spacecraft </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tianle Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zhixiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yongwei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ziwei Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zihao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yizhai Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+P">Panfeng Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17137v1-abstract-short" style="display: inline;"> This paper proposes a distributed on-orbit spacecraft assembly algorithm, where future spacecraft can assemble modules with different functions on orbit to form a spacecraft structure with specific functions. This form of spacecraft organization has the advantages of reconfigurability, fast mission response and easy maintenance. 
10. arXiv:2411.17137 [pdf] (cs.RO, cs.AI)
Self-reconfiguration Strategies for Space-distributed Spacecraft
Authors: Tianle Liu, Zhixiang Wang, Yongwei Zhang, Ziwei Wang, Zihao Liu, Yizhai Zhang, Panfeng Huang
Abstract: This paper proposes a distributed on-orbit spacecraft assembly algorithm, where future spacecraft can assemble modules with different functions on orbit to form a spacecraft structure with specific functions. This form of spacecraft organization has the advantages of reconfigurability, fast mission response and easy maintenance. Reasonable and efficient on-orbit self-reconfiguration algorithms play a crucial role in realizing the benefits of distributed spacecraft. This paper adopts the framework of imitation learning combined with reinforcement learning for strategy learning of module handling order. A robot arm motion algorithm is then designed to execute the handling sequence. We achieve the self-reconfiguration handling task by creating a map on the surface of the module, completing the path point planning of the robotic arm using A*. The joint planning of the robotic arm is then accomplished through forward and reverse kinematics. Finally, the results are presented in Unity3D.
Submitted 26 November, 2024; originally announced November 2024.
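The way-point planning step described in the abstract relies on A*. A minimal grid-based A* sketch is shown below; the 4-connected grid map is an assumed simplification for illustration, not the paper's actual surface representation.

```python
# Standard A* search on a 4-connected grid with a Manhattan-distance heuristic.
import heapq

def astar(grid, start, goal):
    """grid: 2D list where 0 is free and 1 is blocked; start/goal: (row, col)."""
    rows, cols = len(grid), len(grid[0])
    h = lambda p: abs(p[0] - goal[0]) + abs(p[1] - goal[1])
    open_heap = [(h(start), 0, start)]          # (f = g + h, g, node)
    came_from, g = {}, {start: 0}
    while open_heap:
        _, cost, cur = heapq.heappop(open_heap)
        if cur == goal:
            path = [cur]
            while cur in came_from:             # reconstruct the way points
                cur = came_from[cur]
                path.append(cur)
            return path[::-1]
        if cost > g.get(cur, float("inf")):     # skip stale heap entries
            continue
        r, c = cur
        for nr, nc in ((r + 1, c), (r - 1, c), (r, c + 1), (r, c - 1)):
            if 0 <= nr < rows and 0 <= nc < cols and grid[nr][nc] == 0:
                new_cost = cost + 1
                if new_cost < g.get((nr, nc), float("inf")):
                    g[(nr, nc)] = new_cost
                    came_from[(nr, nc)] = cur
                    heapq.heappush(open_heap,
                                   (new_cost + h((nr, nc)), new_cost, (nr, nc)))
    return None                                 # no path found
```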
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16804">arXiv:2411.16804</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16804">pdf</a>, <a href="https://arxiv.org/format/2411.16804">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> InTraGen: Trajectory-controlled Video Generation for Object Interactions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zuhao Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Yanev%2C+A">Aleksandar Yanev</a>, <a href="/search/cs?searchtype=author&amp;query=Mahmood%2C+A">Ahmad Mahmood</a>, <a href="/search/cs?searchtype=author&amp;query=Nikolov%2C+I">Ivan Nikolov</a>, <a href="/search/cs?searchtype=author&amp;query=Motamed%2C+S">Saman Motamed</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+W">Wei-Shi Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xi Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Van+Gool%2C+L">Luc Van Gool</a>, <a href="/search/cs?searchtype=author&amp;query=Paudel%2C+D+P">Danda Pani Paudel</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16804v1-abstract-short" style="display: inline;"> Advances in video generation have significantly improved the realism and quality of created scenes. This has fueled interest in developing intuitive tools that let users leverage video generation as world simulators. Text-to-video (T2V) generation is one such approach, enabling video creation from text descriptions only. Yet, due to the inherent ambiguity in texts and the limited temporal informat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16804v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16804v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16804v1-abstract-full" style="display: none;"> Advances in video generation have significantly improved the realism and quality of created scenes. This has fueled interest in developing intuitive tools that let users leverage video generation as world simulators. Text-to-video (T2V) generation is one such approach, enabling video creation from text descriptions only. Yet, due to the inherent ambiguity in texts and the limited temporal information offered by text prompts, researchers have explored additional control signals like trajectory-guided systems, for more accurate T2V generation. Nonetheless, methods to evaluate whether T2V models can generate realistic interactions between multiple objects are lacking. We introduce InTraGen, a pipeline for improved trajectory-based generation of object interaction scenarios. We propose 4 new datasets and a novel trajectory quality metric to evaluate the performance of the proposed InTraGen. To achieve object interaction, we introduce a multi-modal interaction encoding pipeline with an object ID injection mechanism that enriches object-environment interactions. Our results demonstrate improvements in both visual fidelity and quantitative performance. 
Code and datasets are available at https://github.com/insait-institute/InTraGen <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16804v1-abstract-full').style.display = 'none'; document.getElementById('2411.16804v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16713">arXiv:2411.16713</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.16713">pdf</a>, <a href="https://arxiv.org/format/2411.16713">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Conditional Text-to-Image Generation with Reference Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Kim%2C+T">Taewook Kim</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Ze Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+Z">Zhengyuan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jiang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+L">Lijuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zicheng Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Qiu%2C+Q">Qiang Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16713v1-abstract-short" style="display: inline;"> Text-to-image diffusion models have demonstrated tremendous success in synthesizing visually stunning images given textual instructions. Despite remarkable progress in creating high-fidelity visuals, text-to-image models can still struggle with precisely rendering subjects, such as text spelling. To address this challenge, this paper explores using additional conditions of an image that provides v&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16713v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16713v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16713v1-abstract-full" style="display: none;"> Text-to-image diffusion models have demonstrated tremendous success in synthesizing visually stunning images given textual instructions. Despite remarkable progress in creating high-fidelity visuals, text-to-image models can still struggle with precisely rendering subjects, such as text spelling. To address this challenge, this paper explores using additional conditions of an image that provides visual guidance of the particular subjects for diffusion models to generate. In addition, this reference condition empowers the model to be conditioned in ways that the vocabularies of the text tokenizer cannot adequately represent, and further extends the model&#39;s generalization to novel capabilities such as generating non-English text spellings. 
arXiv:2411.15800 [pdf, other] cs.RO cs.CV
PG-SLAM: Photo-realistic and Geometry-aware RGB-D SLAM in Dynamic Environments
Authors: Haoang Li, Xiangqi Meng, Xingxing Zuo, Zhe Liu, Hesheng Wang, Daniel Cremers
Abstract: Simultaneous localization and mapping (SLAM) has achieved impressive performance in static environments. However, SLAM in dynamic environments remains an open question. Many methods directly filter out dynamic objects, resulting in incomplete scene reconstruction and limited accuracy of camera localization. Other works represent dynamic objects by point clouds, sparse joints, or coarse meshes, which fail to provide a photo-realistic representation. To overcome these limitations, we propose a photo-realistic and geometry-aware RGB-D SLAM method that extends Gaussian splatting. Our method is composed of three main modules that 1) map the dynamic foreground, including non-rigid humans and rigid items, 2) reconstruct the static background, and 3) localize the camera. To map the foreground, we focus on modeling the deformations and/or motions: we consider the shape priors of humans and exploit geometric and appearance constraints of humans and items. For background mapping, we design an optimization strategy between neighboring local maps that integrates an appearance constraint into geometric alignment. For camera localization, we leverage both the static background and the dynamic foreground to increase the observations for noise compensation. We explore the geometric and appearance constraints by associating 3D Gaussians with 2D optical flows and pixel patches. Experiments on various real-world datasets demonstrate that our method outperforms state-of-the-art approaches in terms of camera localization and scene representation. Source code will be publicly available upon paper acceptance.
Submitted 24 November, 2024; originally announced November 2024.
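As a loose illustration of associating 3D Gaussians with 2D optical flow (the abstract gives no formulas, so the exact constraint below is an assumption): project each Gaussian center into two consecutive frames and penalize disagreement between the induced displacement and the estimated flow.

    import torch
    import torch.nn.functional as F

    def project(points_cam, K):
        # points_cam: [N, 3] in camera coordinates; K: [3, 3] intrinsics
        uv = (K @ points_cam.T).T
        return uv[:, :2] / uv[:, 2:3]

    def flow_consistency_residual(centers_w, T1, T2, K, flow):
        # centers_w: [N, 3] Gaussian centers (world); T1, T2: [4, 4] world-to-camera
        # poses at frames t and t+1; flow: [2, H, W] optical flow from t to t+1
        ones = torch.ones(centers_w.shape[0], 1)
        homo = torch.cat([centers_w, ones], dim=1)           # [N, 4]
        p1 = (T1 @ homo.T).T[:, :3]
        p2 = (T2 @ homo.T).T[:, :3]
        uv1, uv2 = project(p1, K), project(p2, K)
        H, W = flow.shape[1:]
        # sample the flow at the frame-t projections (normalized grid coords)
        grid = torch.stack([uv1[:, 0] / (W - 1) * 2 - 1,
                            uv1[:, 1] / (H - 1) * 2 - 1], dim=-1).view(1, 1, -1, 2)
        f = F.grid_sample(flow.unsqueeze(0), grid, align_corners=True)
        f = f.view(2, -1).T                                   # [N, 2] sampled flow
        return (uv2 - uv1 - f).norm(dim=-1)                   # per-Gaussian residual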
arXiv:2411.15595 [pdf] cs.CV cs.AI
An adversarial feature learning based semantic communication method for Human 3D Reconstruction
Authors: Shaojiang Liu, Jiajun Zou, Zhendan Liu, Meixia Dong, Zhiping Wan
Abstract: With the widespread application of human body 3D reconstruction technology across various fields, the demands on data transmission and processing efficiency continue to rise, particularly in scenarios where network bandwidth is limited and low latency is required. This paper introduces an Adversarial Feature Learning-based Semantic Communication method (AFLSC) for human body 3D reconstruction, which focuses on extracting and transmitting the semantic information crucial to the 3D reconstruction task, thereby significantly optimizing data flow and alleviating bandwidth pressure. At the sender's end, we propose a multitask-learning-based feature extraction method to capture spatial layout, keypoint, posture, and depth information from 2D human images, and design a semantic encoding technique based on adversarial feature learning to encode this feature information into semantic data. We also develop a dynamic compression technique to transmit this semantic data efficiently, greatly enhancing transmission efficiency and reducing latency. At the receiver's end, we design an efficient multi-level semantic feature decoding method to convert the semantic data back into key image features. Finally, an improved ViT-diffusion model is employed for 3D reconstruction, producing human body 3D mesh models. Experimental results validate the advantages of our method in terms of data transmission efficiency and reconstruction quality, demonstrating its strong potential for application in bandwidth-limited environments.
Submitted 23 November, 2024; originally announced November 2024.
arXiv:2411.15450 [pdf, other] cs.CR
Unveiling the Achilles' Heel: Backdoor Watermarking Forgery Attack in Public Dataset Protection
Authors: Zhiying Li, Zhi Liu, Dongjie Liu, Shengda Zhuo, Guanggang Geng, Jian Weng, Shanxiang Lyu, Xiaobo Jin
Abstract: High-quality datasets can greatly promote the development of technology. However, dataset construction is expensive and time-consuming, and public datasets are easily exploited by opportunists seeking quick gains, which seriously infringes on the rights and interests of dataset owners. At present, backdoor watermarks recast dataset protection as proof of ownership and have become a popular method to protect the copyright of public datasets, effectively safeguarding the rights of owners and promoting the development of open-source communities. In this paper, we question the reliability of backdoor watermarks and re-examine them from the perspective of attackers. On the one hand, we refine the process of backdoor watermarking by introducing a third-party judicial agency to enhance its practical applicability in real-world scenarios. On the other hand, by exploring the problem of forgery attacks, we reveal the inherent flaws of the dataset ownership verification process. Specifically, we design a Forgery Watermark Generator (FW-Gen) to generate forged watermarks and define a distillation loss between the original watermark and the forged watermark to transfer the information in the original watermark to the forged watermark. Extensive experiments show that forged watermarks have the same statistical significance as original watermarks in copyright verification tests under various conditions and scenarios, indicating that dataset ownership verification results are insufficient to determine infringement. These findings highlight the unreliability of backdoor watermarking methods for dataset ownership verification and suggest new directions for protecting public datasets.
Submitted 22 November, 2024; originally announced November 2024.
Comments: 13 pages, 6 figures
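The distillation loss between the original and the forged watermark is described only verbally. One plausible reading (an assumption, not the paper's exact objective) is to match the watermarked model's predictions on samples carrying the forged trigger to its predictions on samples carrying the original trigger:

    import torch
    import torch.nn.functional as F

    def watermark_distillation_loss(model, images, original_trigger, forged_trigger):
        """KL between the model's outputs under the original and the forged
        trigger; minimizing it transfers the original watermark's behavior
        to the forged one (additive triggers are an assumption here)."""
        with torch.no_grad():
            teacher = F.softmax(model(images + original_trigger), dim=-1)
        student = F.log_softmax(model(images + forged_trigger), dim=-1)
        return F.kl_div(student, teacher, reduction="batchmean")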
arXiv:2411.15439 [pdf, other] cs.CV
Twin Trigger Generative Networks for Backdoor Attacks against Object Detection
Authors: Zhiying Li, Zhi Liu, Guanggang Geng, Shreyank N Gowda, Shuyuan Lin, Jian Weng, Xiaobo Jin
Abstract: Object detectors, which are widely used in real-world applications, are vulnerable to backdoor attacks. This vulnerability arises because many users rely on datasets or pre-trained models provided by third parties due to constraints on data and resources. However, most research on backdoor attacks has focused on image classification, with limited investigation into object detection. Furthermore, the triggers in most existing backdoor attacks on object detection are manually generated, requiring prior knowledge and consistent patterns between the training and inference stages. This makes the attacks either easy to detect or difficult to adapt to various scenarios. To address these limitations, we propose novel twin trigger generative networks in the frequency domain: they generate invisible triggers for implanting stealthy backdoors into models during training and visible triggers for steady activation during inference, making the attack process difficult to trace. Specifically, for the invisible trigger generative network, we deploy a Gaussian smoothing layer and a high-frequency artifact classifier to enhance the stealthiness of backdoor implantation in object detectors. For the visible trigger generative network, we design a novel alignment loss to optimize the visible triggers so that they differ from the original patterns but still align with the malicious activation behavior of the invisible triggers. Extensive experimental results and analyses demonstrate the feasibility of using different triggers in the training and inference stages and the attack effectiveness of our proposed visible and invisible trigger generative networks, significantly reducing the mAP_0.5 of the object detectors, including YOLOv5 and YOLOv7 under different settings, by 70.0% and 84.5%, respectively.
Submitted 22 November, 2024; originally announced November 2024.
Comments: 13 pages, 8 figures
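The Gaussian smoothing layer mentioned for the invisible-trigger network can be pictured as a fixed depthwise blur applied to the generated trigger to suppress high-frequency artifacts. The sketch below is a generic implementation of such a layer, not the paper's exact configuration.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class GaussianSmoothing(nn.Module):
        """Fixed depthwise Gaussian blur, e.g. to damp high-frequency artifacts
        in a generated trigger (a hypothetical stand-in for the paper's layer)."""
        def __init__(self, channels=3, kernel_size=5, sigma=1.0):
            super().__init__()
            ax = torch.arange(kernel_size) - (kernel_size - 1) / 2
            g = torch.exp(-(ax ** 2) / (2 * sigma ** 2))
            kernel = torch.outer(g, g)
            kernel = (kernel / kernel.sum()).expand(channels, 1, kernel_size, kernel_size).clone()
            self.register_buffer("weight", kernel)
            self.groups = channels
            self.pad = kernel_size // 2

        def forward(self, x):   # x: [B, C, H, W]
            return F.conv2d(x, self.weight, padding=self.pad, groups=self.groups)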
arXiv:2411.15296 [pdf, other] cs.CV cs.AI cs.CL
MME-Survey: A Comprehensive Survey on Evaluation of Multimodal LLMs
Authors: Chaoyou Fu, Yi-Fan Zhang, Shukang Yin, Bo Li, Xinyu Fang, Sirui Zhao, Haodong Duan, Xing Sun, Ziwei Liu, Liang Wang, Caifeng Shan, Ran He
Abstract: As a prominent direction of Artificial General Intelligence (AGI), Multimodal Large Language Models (MLLMs) have garnered increased attention from both industry and academia. Building upon pre-trained LLMs, this family of models further develops impressive multimodal perception and reasoning capabilities, such as writing code given a flow chart or creating stories based on an image. In the development process, evaluation is critical since it provides intuitive feedback and guidance on improving models. Distinct from the traditional train-eval-test paradigm that only favors a single task like image classification, the versatility of MLLMs has spurred the rise of various new benchmarks and evaluation methods. In this paper, we aim to present a comprehensive survey of MLLM evaluation, discussing four key aspects: 1) the summarized benchmark types, divided by the evaluated capabilities into foundation capabilities, model self-analysis, and extended applications; 2) the typical process of benchmark construction, consisting of data collection, annotation, and precautions; 3) the systematic evaluation manner, composed of judge, metric, and toolkit; and 4) the outlook for the next benchmark. This work aims to offer researchers an easy grasp of how to effectively evaluate MLLMs according to different needs and to inspire better evaluation methods, thereby driving the progress of MLLM research.
Submitted 22 November, 2024; originally announced November 2024.
Comments: Produced by the MME+MMBench+LLaVA Teams. Project Page: https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Benchmarks
arXiv:2411.15138 [pdf, other] cs.CV cs.GR
Material Anything: Generating Materials for Any 3D Object via Diffusion
Authors: Xin Huang, Tengfei Wang, Ziwei Liu, Qing Wang
Abstract: We present Material Anything, a fully-automated, unified diffusion framework designed to generate physically-based materials for 3D objects. Unlike existing methods that rely on complex pipelines or case-specific optimizations, Material Anything offers a robust, end-to-end solution adaptable to objects under diverse lighting conditions. Our approach leverages a pre-trained image diffusion model, enhanced with a triple-head architecture and rendering loss to improve stability and material quality. Additionally, we introduce confidence masks as a dynamic switcher within the diffusion model, enabling it to effectively handle both textured and texture-less objects across varying lighting conditions. By employing a progressive material generation strategy guided by these confidence masks, along with a UV-space material refiner, our method ensures consistent, UV-ready material outputs. Extensive experiments demonstrate that our approach outperforms existing methods across a wide range of object categories and lighting conditions.
Submitted 22 November, 2024; originally announced November 2024.
Comments: Project page: https://xhuangcv.github.io/MaterialAnything/
arXiv:2411.15056 [pdf, other] cs.CY cs.AI
Financial Risk Assessment via Long-term Payment Behavior Sequence Folding
Authors: Yiran Qiao, Yateng Tang, Xiang Ao, Qi Yuan, Ziming Liu, Chen Shen, Xuehao Zheng
Abstract: Online inclusive financial services encounter significant financial risks due to their expansive user base and low default costs. Through real-world practice, we find that utilizing longer-term user payment behaviors can enhance models' ability to forecast financial risks. However, learning long behavior sequences is non-trivial for deep sequential models, and the diverse fields of payment behaviors carry rich information that requires thorough exploitation. These factors collectively complicate the task of long-term user behavior modeling. To tackle these challenges, we propose a Long-term Payment Behavior Sequence Folding method, referred to as LBSF. In LBSF, payment behavior sequences are folded based on merchants, using the merchant field as an intrinsic grouping criterion, which enables informative parallelism without reliance on external knowledge. Meanwhile, we maximize the utility of payment details through a multi-field behavior encoding mechanism. Subsequently, behavior aggregation at the merchant level, followed by relational learning across merchants, facilitates a comprehensive user financial representation. We evaluate LBSF on the financial risk assessment task using a large-scale real-world dataset. The results demonstrate that folding long behavior sequences based on internal behavioral cues effectively models long-term patterns and changes, thereby generating more accurate user financial profiles for practical applications.
Submitted 22 November, 2024; originally announced November 2024.
Comments: ICDM 2024 long paper
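Folding a long payment sequence by merchant can be pictured as grouping events by the merchant field, encoding each merchant-level sub-sequence with a shared encoder, and aggregating the merchant embeddings into a user representation. The PyTorch sketch below is a minimal illustration under those assumptions; the paper's actual encoders and aggregation are more elaborate.

    import torch
    import torch.nn as nn

    class MerchantFoldingEncoder(nn.Module):
        """Fold a long behavior sequence by merchant: encode each merchant's
        sub-sequence with a shared GRU, then attend over merchant embeddings."""
        def __init__(self, feat_dim=8, hidden=32):
            super().__init__()
            self.gru = nn.GRU(feat_dim, hidden, batch_first=True)
            self.agg = nn.MultiheadAttention(hidden, num_heads=4, batch_first=True)
            self.query = nn.Parameter(torch.randn(1, 1, hidden))

        def forward(self, events, merchant_ids):
            # events: [L, feat_dim] behavior features; merchant_ids: [L] integer ids
            folded = []
            for m in merchant_ids.unique():
                sub = events[merchant_ids == m].unsqueeze(0)   # [1, L_m, feat_dim]
                _, h = self.gru(sub)                           # h: [1, 1, hidden]
                folded.append(h[-1])                           # [1, hidden]
            merchants = torch.stack(folded, dim=1)             # [1, M, hidden]
            user, _ = self.agg(self.query, merchants, merchants)
            return user.squeeze(0).squeeze(0)                  # [hidden] user vector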
arXiv:2411.14982 [pdf, other] cs.CV cs.CL
Large Multi-modal Models Can Interpret Features in Large Multi-modal Models
Authors: Kaichen Zhang, Yifei Shen, Bo Li, Ziwei Liu
Abstract: Recent advances in Large Multimodal Models (LMMs) have led to significant breakthroughs in both academia and industry. One question that arises is how we, as humans, can understand their internal neural representations. This paper takes an initial step towards addressing this question by presenting a versatile framework to identify and interpret the semantics within LMMs. Specifically, 1) we first apply a Sparse Autoencoder (SAE) to disentangle the representations into human-understandable features; 2) we then present an automatic interpretation framework in which the open-semantic features learned by the SAE are interpreted by the LMMs themselves. We employ this framework to analyze the LLaVA-NeXT-8B model using the LLaVA-OV-72B model and demonstrate that these features can effectively steer the model's behavior. Our results contribute to a deeper understanding of why LMMs excel in specific tasks, including EQ tests, and illuminate the nature of their mistakes along with potential strategies for their rectification. These findings offer new insights into the internal mechanisms of LMMs and suggest parallels with the cognitive processes of the human brain.
Submitted 22 November, 2024; originally announced November 2024.
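A sparse autoencoder of the kind referenced here is typically an overcomplete linear encoder/decoder with a ReLU and an L1 sparsity penalty on the feature activations. A generic sketch follows; the dimensions are illustrative and not taken from the paper.

    import torch
    import torch.nn as nn

    class SparseAutoencoder(nn.Module):
        """Overcomplete ReLU dictionary over model activations (generic sketch)."""
        def __init__(self, d_model=4096, d_dict=16384):
            super().__init__()
            self.encoder = nn.Linear(d_model, d_dict)
            self.decoder = nn.Linear(d_dict, d_model)

        def forward(self, x):
            z = torch.relu(self.encoder(x))   # sparse feature activations
            return self.decoder(z), z

    def sae_loss(x, x_hat, z, l1_coef=1e-3):
        # reconstruction error plus L1 sparsity penalty on the activations
        return ((x - x_hat) ** 2).mean() + l1_coef * z.abs().mean()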
arXiv:2411.14951 [pdf, other] cs.CV
Morph: A Motion-free Physics Optimization Framework for Human Motion Generation
Authors: Zhuo Li, Mingshuang Luo, Ruibing Hou, Xin Zhao, Hao Liu, Hong Chang, Zimo Liu, Chen Li
Abstract: Human motion generation plays a vital role in applications such as digital humans and humanoid robot control. However, most existing approaches disregard physics constraints, leading to the frequent production of physically implausible motions with pronounced artifacts such as floating and foot sliding. In this paper, we propose Morph, a Motion-free physics optimization framework comprising a Motion Generator and a Motion Physics Refinement module, which enhances physical plausibility without relying on costly real-world motion data. Specifically, the Motion Generator is responsible for providing large-scale synthetic motion data, while the Motion Physics Refinement module uses these synthetic data to train a motion imitator within a physics simulator, enforcing physical constraints that project the noisy motions into a physically plausible space. These physically refined motions are in turn used to fine-tune the Motion Generator, further enhancing its capability. Experiments on both text-to-motion and music-to-dance generation tasks demonstrate that our framework achieves state-of-the-art motion generation quality while drastically improving physical plausibility.
Submitted 22 November, 2024; originally announced November 2024.
Comments: 15 pages, 6 figures
arXiv:2411.14847 [pdf, other] cs.CV cs.AI
Dynamics-Aware Gaussian Splatting Streaming Towards Fast On-the-Fly Training for 4D Reconstruction
Authors: Zhening Liu, Yingdong Hu, Xinjie Zhang, Jiawei Shao, Zehong Lin, Jun Zhang
Abstract: The recent development of 3D Gaussian Splatting (3DGS) has led to great interest in 4D dynamic spatial reconstruction from multi-view visual inputs. While existing approaches mainly rely on processing full-length multi-view videos for 4D reconstruction, there has been limited exploration of iterative online reconstruction methods that enable on-the-fly training and per-frame streaming. Current 3DGS-based streaming methods treat the Gaussian primitives uniformly and constantly renew the densified Gaussians, thereby overlooking the difference between dynamic and static features and neglecting the temporal continuity in the scene. To address these limitations, we propose a novel three-stage pipeline for iterative, streamable 4D dynamic spatial reconstruction. Our pipeline comprises a selective inheritance stage to preserve temporal continuity, a dynamics-aware shift stage to distinguish dynamic and static primitives and optimize their movements, and an error-guided densification stage to accommodate emerging objects. Our method achieves state-of-the-art performance in online 4D reconstruction, demonstrating a 20% improvement in on-the-fly training speed, superior representation quality, and real-time rendering capability. Project page: https://www.liuzhening.top/DASS
Submitted 22 November, 2024; originally announced November 2024.
Comments: Project page: https://www.liuzhening.top/DASS
arXiv:2411.14795 [pdf, other] cs.CL
De-biased Multimodal Electrocardiogram Analysis
Authors: Haitao Li, Ziyu Li, Yiheng Mao, Ziyi Liu, Zhoujian Sun, Zhengxing Huang
Abstract: Multimodal large language models (MLLMs) are increasingly being applied in the medical field, particularly in medical imaging. However, developing MLLMs for ECG signals, which are crucial in clinical settings, has been a significant challenge beyond medical imaging. Previous studies have attempted to address this by converting ECGs into several text tags using an external classifier in a training-free manner. However, this approach significantly compresses the information in ECGs and underutilizes the reasoning capabilities of LLMs. In this work, we directly feed the embeddings of ECGs into the LLM through a projection layer, retaining more information about the ECGs and better leveraging the reasoning abilities of LLMs. Our method can also effectively handle a common situation in clinical practice where two ECGs taken at different times must be compared. Recent studies have found that MLLMs may rely solely on the text input to provide answers, ignoring inputs from other modalities. We analyzed this phenomenon from a causal perspective in the context of ECG MLLMs and discovered that a confounder, severity of illness, introduces a spurious correlation between the question and the answer, leading the model to rely on this spurious correlation and ignore the ECG input. Such models do not comprehend the ECG input and perform poorly in adversarial tests where different expressions of the same question are used in the training and testing sets. We designed a de-biased pre-training method to eliminate the confounder's effect according to the theory of backdoor adjustment. Our model performed well on the ECG-QA task under adversarial testing and demonstrated zero-shot capabilities. An interesting random ECG test further validated that our model effectively understands and utilizes the input ECG signal.
Submitted 22 November, 2024; originally announced November 2024.
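Feeding ECG embeddings into an LLM through a projection layer can be sketched as mapping the ECG encoder output to a few pseudo-tokens in the LLM's embedding space and prepending them to the text embeddings. The dimensions and token count below are illustrative assumptions, not values from the paper.

    import torch
    import torch.nn as nn

    class ECGProjector(nn.Module):
        """Map ECG encoder features into the LLM embedding space and prepend
        them to the text token embeddings (illustrative names and sizes)."""
        def __init__(self, ecg_dim=256, llm_dim=4096, n_tokens=8):
            super().__init__()
            self.proj = nn.Linear(ecg_dim, llm_dim * n_tokens)
            self.n_tokens, self.llm_dim = n_tokens, llm_dim

        def forward(self, ecg_feat, text_embeds):
            # ecg_feat: [B, ecg_dim]; text_embeds: [B, T, llm_dim]
            ecg_tokens = self.proj(ecg_feat).view(-1, self.n_tokens, self.llm_dim)
            return torch.cat([ecg_tokens, text_embeds], dim=1)  # [B, n_tokens+T, llm_dim]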
arXiv:2411.14790 [pdf, other] cs.CL cs.AI
KBAlign: Efficient Self Adaptation on Specific Knowledge Bases
Authors: Zheni Zeng, Yuxuan Chen, Shi Yu, Yukun Yan, Zhenghao Liu, Shuo Wang, Xu Han, Zhiyuan Liu, Maosong Sun
Abstract: Humans can use techniques such as creating self-assessment questions to quickly acquire knowledge from specific materials in advance, enabling us to achieve related tasks more efficiently. In contrast, large language models (LLMs) usually rely on retrieval-augmented generation to exploit knowledge materials on the spot, or require external signals such as human preference data and stronger LLM annotations to conduct knowledge adaptation. To unleash the self-learning potential of LLMs, we propose KBAlign, an approach designed for efficient adaptation to downstream tasks involving knowledge bases. Our method uses iterative training with self-annotated data such as Q&A pairs and revision suggestions, enabling the model to grasp the knowledge content efficiently. Experimental results on multiple datasets demonstrate the effectiveness of our approach, significantly boosting model performance on downstream tasks that require specific knowledge at a low cost. Notably, our approach achieves over 90% of the performance improvement obtainable with GPT-4-turbo annotation while relying entirely on self-supervision. We release our experimental data, models, and process analyses to the community for further exploration (https://github.com/thunlp/KBAlign).
Submitted 24 November, 2024; v1 submitted 22 November, 2024; originally announced November 2024.
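The self-annotation loop is described only at a high level. A rough sketch of the idea, with a placeholder ask function standing in for the model's own generation call and the fine-tuning step left as a comment, might look like:

    def self_annotate(kb_chunks, ask):
        """Build self-annotated Q&A pairs from knowledge-base chunks.
        `ask` is a placeholder for the model's own text-generation call."""
        pairs = []
        for chunk in kb_chunks:
            question = ask(f"Write a question answerable from this passage:\n{chunk}")
            answer = ask(f"Passage:\n{chunk}\n\nQuestion: {question}\nAnswer:")
            pairs.append({"question": question, "answer": answer, "source": chunk})
        return pairs

    # One (hypothetical) round of iterative adaptation:
    # pairs = self_annotate(kb_chunks, ask=current_model_generate)
    # fine_tune(model, pairs)   # then regenerate annotations with the updated model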
Traditional model-based approaches, including those leveraging in-context demonstrations and those with specialized medical fine-tuning, have demonstrated strong performance in medical language processing but struggle with real-time adap&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14461v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14461v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14461v1-abstract-full" style="display: none;"> Artificial Intelligence (AI) has become essential in modern healthcare, with large language models (LLMs) offering promising advances in clinical decision-making. Traditional model-based approaches, including those leveraging in-context demonstrations and those with specialized medical fine-tuning, have demonstrated strong performance in medical language processing but struggle with real-time adaptability, multi-step reasoning, and handling complex medical tasks. Agent-based AI systems address these limitations by incorporating reasoning traces, tool selection based on context, knowledge retrieval, and both short- and long-term memory. These additional features enable the medical AI agent to handle complex medical scenarios where decision-making should be built on real-time interaction with the environment. Therefore, unlike conventional model-based approaches that treat medical queries as isolated questions, medical AI agents approach them as complex tasks and behave more like human doctors. In this paper, we study the choice of the backbone LLM for medical AI agents, which is the foundation for the agent&#39;s overall reasoning and action generation. In particular, we consider the emergent o1 model and examine its impact on agents&#39; reasoning, tool-use adaptability, and real-time information retrieval across diverse clinical scenarios, including high-stakes settings such as intensive care units (ICUs). Our findings demonstrate o1&#39;s ability to enhance diagnostic accuracy and consistency, paving the way for smarter, more responsive AI tools that support better patient outcomes and decision-making efficacy in clinical practice. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14461v1-abstract-full').style.display = 'none'; document.getElementById('2411.14461v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 

arXiv:2411.14432 (https://arxiv.org/abs/2411.14432) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Insight-V: Exploring Long-Chain Visual Reasoning with Multimodal Large Language Models
Authors: Yuhao Dong, Zuyan Liu, Hai-Long Sun, Jingkang Yang, Winston Hu, Yongming Rao, Ziwei Liu
Abstract: Large Language Models (LLMs) demonstrate enhanced capabilities and reliability by reasoning more, evolving from Chain-of-Thought prompting to product-level solutions like OpenAI o1. Despite various efforts to improve LLM reasoning, high-quality long-chain reasoning data and optimized training pipelines still remain inadequately explored in vision-language tasks. In this paper, we present Insight-V, an early effort to 1) scalably produce long and robust reasoning data for complex multi-modal tasks, and 2) an effective training pipeline to enhance the reasoning capabilities of multi-modal large language models (MLLMs). Specifically, to create long and structured reasoning data without human labor, we design a two-step pipeline with a progressive strategy to generate sufficiently long and diverse reasoning paths and a multi-granularity assessment method to ensure data quality. We observe that directly supervising MLLMs with such long and complex reasoning data will not yield ideal reasoning ability. To tackle this problem, we design a multi-agent system consisting of a reasoning agent dedicated to performing long-chain reasoning and a summary agent trained to judge and summarize reasoning results. We further incorporate an iterative DPO algorithm to enhance the reasoning agent's generation stability and quality. Based on the popular LLaVA-NeXT model and our stronger base MLLM, we demonstrate significant performance gains across challenging multi-modal benchmarks requiring visual reasoning. Benefiting from our multi-agent system, Insight-V can also easily maintain or improve performance on perception-focused multi-modal tasks.
Submitted 21 November, 2024; originally announced November 2024.

arXiv:2411.14423 (https://arxiv.org/abs/2411.14423) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Unleashing the Potential of Multi-modal Foundation Models and Video Diffusion for 4D Dynamic Physical Scene Simulation
Authors: Zhuoman Liu, Weicai Ye, Yan Luximon, Pengfei Wan, Di Zhang
Abstract: Realistic simulation of dynamic scenes requires accurately capturing diverse material properties and modeling complex object interactions grounded in physical principles. However, existing methods are constrained to basic material types with limited predictable parameters, making them insufficient to represent the complexity of real-world materials. We introduce a novel approach that leverages multi-modal foundation models and video diffusion to achieve enhanced 4D dynamic scene simulation. Our method utilizes multi-modal models to identify material types and initialize material parameters through image queries, while simultaneously inferring 3D Gaussian splats for detailed scene representation. We further refine these material parameters using video diffusion with a differentiable Material Point Method (MPM) and optical flow guidance rather than render loss or Score Distillation Sampling (SDS) loss. This integrated framework enables accurate prediction and realistic simulation of dynamic interactions in real-world scenarios, advancing both accuracy and flexibility in physics-based simulations.
Submitted 21 November, 2024; originally announced November 2024.
Comments: Homepage: https://zhuomanliu.github.io/PhysFlow/

arXiv:2411.14250 (https://arxiv.org/abs/2411.14250) [pdf, other]
Subjects: eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Title: CP-UNet: Contour-based Probabilistic Model for Medical Ultrasound Images Segmentation
Authors: Ruiguo Yu, Yiyang Zhang, Yuan Tian, Zhiqiang Liu, Xuewei Li, Jie Gao
Abstract: Deep learning-based segmentation methods are widely utilized for detecting lesions in ultrasound images. Throughout the imaging procedure, the attenuation and scattering of ultrasound waves cause contour blurring and the formation of artifacts, limiting the clarity of the acquired ultrasound images. To overcome this challenge, we propose a contour-based probabilistic segmentation model CP-UNet, which guides the segmentation network to enhance its focus on contour during decoding. We design a novel down-sampling module to enable the contour probability distribution modeling and encoding stages to acquire global-local features. Furthermore, the Gaussian Mixture Model utilizes optimized features to model the contour distribution, capturing the uncertainty of lesion boundaries. Extensive experiments with several state-of-the-art deep learning segmentation methods on three ultrasound image datasets show that our method performs better on breast and thyroid lesions segmentation.
Submitted 21 November, 2024; originally announced November 2024.
Comments: 4 pages, 4 figures, 2 tables; For icassp2025

arXiv:2411.13878 (https://arxiv.org/abs/2411.13878) [pdf, other]
Subjects: cs.IT (Information Theory)
Title: Sparse Zero Correlation Zone Arrays for Training Design in Spatial Modulation Systems
Authors: Cheng-Yu Pai, Zilong Liu, Chao-Yu Chen
Abstract: This paper presents a novel training matrix design for spatial modulation (SM) systems, by introducing a new class of two-dimensional (2D) arrays called sparse zero correlation zone (SZCZ) arrays. An SZCZ array is characterized by a majority of zero entries and exhibits the zero periodic auto- and cross-correlation zone properties across any two rows. With these unique properties, we show that SZCZ arrays can be effectively used as training matrices for SM systems. Additionally, direct constructions of SZCZ arrays with large ZCZ widths and controllable sparsity levels based on 2D restricted generalized Boolean functions (RGBFs) are proposed. Compared with existing training schemes, the proposed SZCZ-based training matrices have larger ZCZ widths, thereby offering greater tolerance for delay spread in multipath channels. Simulation results demonstrate that the proposed SZCZ-based training design exhibits superior channel estimation performance over frequency-selective fading channels compared to existing alternatives.
Submitted 21 November, 2024; originally announced November 2024.

arXiv:2411.13826 (https://arxiv.org/abs/2411.13826) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.LG (Machine Learning)
Title: Interactive and Expressive Code-Augmented Planning with Large Language Models
Authors: Anthony Z. Liu, Xinhe Wang, Jacob Sansom, Yao Fu, Jongwook Choi, Sungryull Sohn, Jaekyeom Kim, Honglak Lee
Abstract: Large Language Models (LLMs) demonstrate strong abilities in common-sense reasoning and interactive decision-making, but often struggle with complex, long-horizon planning tasks. Recent techniques have sought to structure LLM outputs using control flow and other code-adjacent techniques to improve planning performance. These techniques include using variables (to track important information) and functions (to divide complex tasks into smaller re-usable sub-tasks). However, purely code-based approaches can be error-prone and insufficient for handling ambiguous or unstructured data. To address these challenges, we propose REPL-Plan, an LLM planning approach that is fully code-expressive (it can utilize all the benefits of code) while also being dynamic (it can flexibly adapt from errors and use the LLM for fuzzy situations). In REPL-Plan, an LLM solves tasks by interacting with a Read-Eval-Print Loop (REPL), which iteratively executes and evaluates code, similar to language shells or interactive code notebooks, allowing the model to flexibly correct errors and handle tasks dynamically. We demonstrate that REPL-Plan achieves strong results across various planning domains compared to previous methods. A minimal sketch of this interaction pattern follows the entry below.
Submitted 20 November, 2024; originally announced November 2024.
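
Aside: the REPL-Plan abstract above describes a concrete interaction pattern (an LLM proposes code, a persistent Read-Eval-Print Loop executes it, and outputs or errors are fed back until the task is done). The Python sketch below illustrates that generic pattern only; it is not the authors' implementation. The ask_llm function is a hypothetical stand-in for a real model call, and the "DONE" stopping convention is an assumption made purely for illustration.

import io
import traceback
from contextlib import redirect_stdout

def ask_llm(transcript: str) -> str:
    # Hypothetical stand-in for an LLM call: given the interaction so far,
    # return the next code snippet to run, or "DONE" when finished.
    # For this sketch it emits one canned snippet, then stops.
    if ">>>" not in transcript:
        return "import os\nprint(len(os.listdir('.')))"
    return "DONE"

def run_in_repl(snippet: str, namespace: dict) -> str:
    # Execute a snippet in a persistent namespace and return its output
    # (stdout or a traceback), mimicking an interactive shell cell.
    buffer = io.StringIO()
    try:
        with redirect_stdout(buffer):
            exec(snippet, namespace)          # state persists across turns
    except Exception:
        buffer.write(traceback.format_exc())  # errors are observations, not fatal
    return buffer.getvalue()

def repl_plan_loop(task: str, max_turns: int = 8) -> str:
    namespace: dict = {}
    transcript = f"Task: {task}\n"
    for _ in range(max_turns):
        snippet = ask_llm(transcript)
        if snippet.strip() == "DONE":
            break
        observation = run_in_repl(snippet, namespace)
        transcript += f">>> {snippet}\n{observation}\n"
    return transcript

if __name__ == "__main__":
    print(repl_plan_loop("count the files in the current directory"))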

arXiv:2411.13814 (https://arxiv.org/abs/2411.13814) [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence)
Title: AutoMixQ: Self-Adjusting Quantization for High Performance Memory-Efficient Fine-Tuning
Authors: Changhai Zhou, Shiyang Zhang, Yuhua Zhou, Zekai Liu, Shichao Weng
Abstract: Fine-tuning large language models (LLMs) under resource constraints is a significant challenge in deep learning. Low-Rank Adaptation (LoRA), pruning, and quantization are all effective methods for improving resource efficiency. However, combining them directly often results in suboptimal performance, especially with uniform quantization across all model layers. This is due to the complex, uneven interlayer relationships introduced by pruning, necessitating more refined quantization strategies. To address this, we propose AutoMixQ, an end-to-end optimization framework that selects optimal quantization configurations for each LLM layer. AutoMixQ leverages lightweight performance models to guide the selection process, significantly reducing time and computational resources compared to exhaustive search methods. By incorporating Pareto optimality, AutoMixQ balances memory usage and performance, approaching the upper bounds of model capability under strict resource constraints. Our experiments on widely used benchmarks show that AutoMixQ reduces memory consumption while achieving superior performance. For example, at a 30% pruning rate in LLaMA-7B, AutoMixQ achieved 66.21% on BoolQ compared to 62.45% for LoRA and 58.96% for LoftQ, while reducing memory consumption by 35.5% compared to LoRA and 27.5% compared to LoftQ.
Submitted 20 November, 2024; originally announced November 2024.

arXiv:2411.13611 (https://arxiv.org/abs/2411.13611) [pdf, other]
Subjects: cs.SE (Software Engineering); cs.AI (Artificial Intelligence)
Title: DSTC: Direct Preference Learning with Only Self-Generated Tests and Code to Improve Code LMs
Authors: Zhihan Liu, Shenao Zhang, Zhaoran Wang
Abstract: Direct preference learning offers a promising and computation-efficient approach beyond supervised fine-tuning (SFT) for improving code generation in coding large language models (LMs). However, the scarcity of reliable preference data is a bottleneck for the performance of direct preference learning to improve the coding accuracy of code LMs. In this paper, we introduce Direct Preference Learning with Only Self-Generated Tests and Code (DSTC), a framework that leverages only self-generated code snippets and tests to construct reliable preference pairs such that direct preference learning can improve LM coding accuracy without external annotations. DSTC combines a minimax selection process and test-code concatenation to improve preference pair quality, reducing the influence of incorrect self-generated tests and enhancing model performance without the need for costly reward models. When applied with direct preference learning methods such as Direct Preference Optimization (DPO) and Kahneman-Tversky Optimization (KTO), DSTC yields stable improvements in coding accuracy (pass@1 score) across diverse coding benchmarks, including HumanEval, MBPP, and BigCodeBench, demonstrating both its effectiveness and scalability for models of various sizes. This approach autonomously enhances code generation accuracy across LLMs of varying sizes, reducing reliance on expensive annotated coding datasets.
Submitted 25 November, 2024; v1 submitted 19 November, 2024; originally announced November 2024.
Comments: We temporarily modified the author list because of the pending verification from the company

arXiv:2411.13602 (https://arxiv.org/abs/2411.13602) [pdf]
Subjects: eess.IV (Image and Video Processing); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Title: Large-scale cross-modality pretrained model enhances cardiovascular state estimation and cardiomyopathy detection from electrocardiograms: An AI system development and multi-center validation study
Authors: Zhengyao Ding, Yujian Hu, Youyao Xu, Chengchen Zhao, Ziyu Li, Yiheng Mao, Haitao Li, Qian Li, Jing Wang, Yue Chen, Mengjia Chen, Longbo Wang, Xuesen Chu, Weichao Pan, Ziyi Liu, Fei Wu, Hongkun Zhang, Ting Chen, Zhengxing Huang
Abstract: Cardiovascular diseases (CVDs) present significant challenges for early and accurate diagnosis. While cardiac magnetic resonance imaging (CMR) is the gold standard for assessing cardiac function and diagnosing CVDs, its high cost and technical complexity limit accessibility. In contrast, electrocardiography (ECG) offers promise for large-scale early screening. This study introduces CardiacNets, an innovative model that enhances ECG analysis by leveraging the diagnostic strengths of CMR through cross-modal contrastive learning and generative pretraining. CardiacNets serves two primary functions: (1) it evaluates detailed cardiac function indicators and screens for potential CVDs, including coronary artery disease, cardiomyopathy, pericarditis, heart failure and pulmonary hypertension, using ECG input; and (2) it enhances interpretability by generating high-quality CMR images from ECG data. We train and validate the proposed CardiacNets on two large-scale public datasets (the UK Biobank with 41,519 individuals and the MIMIC-IV-ECG comprising 501,172 samples) as well as three private datasets (FAHZU with 410 individuals, SAHZU with 464 individuals, and QPH with 338 individuals), and the findings demonstrate that CardiacNets consistently outperforms traditional ECG-only models, substantially improving screening accuracy. Furthermore, the generated CMR images provide valuable diagnostic support for physicians of all experience levels. This proof-of-concept study highlights how ECG can facilitate cross-modal insights into cardiac function assessment, paving the way for enhanced CVD screening and diagnosis at a population level.
Submitted 19 November, 2024; originally announced November 2024.
Comments: 23 pages, 8 figures

arXiv:2411.13578 (https://arxiv.org/abs/2411.13578) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence); cs.LG (Machine Learning)
Title: COOD: Concept-based Zero-shot OOD Detection
Authors: Zhendong Liu, Yi Nian, Henry Peng Zou, Li Li, Xiyang Hu, Yue Zhao
Abstract: How can models effectively detect out-of-distribution (OOD) samples in complex, multi-label settings without extensive retraining? Existing OOD detection methods struggle to capture the intricate semantic relationships and label co-occurrences inherent in multi-label settings, often requiring large amounts of training data and failing to generalize to unseen label combinations. While large language models have revolutionized zero-shot OOD detection, they primarily focus on single-label scenarios, leaving a critical gap in handling real-world tasks where samples can be associated with multiple interdependent labels. To address these challenges, we introduce COOD, a novel zero-shot multi-label OOD detection framework. COOD leverages pre-trained vision-language models, enhancing them with a concept-based label expansion strategy and a new scoring function. By enriching the semantic space with both positive and negative concepts for each label, our approach models complex label dependencies, precisely differentiating OOD samples without the need for additional training. Extensive experiments demonstrate that our method significantly outperforms existing approaches, achieving approximately 95% average AUROC on both VOC and COCO datasets, while maintaining robust performance across varying numbers of labels and different types of OOD samples.
Submitted 15 November, 2024; originally announced November 2024.

arXiv:2411.13547 (https://arxiv.org/abs/2411.13547) [pdf, other]
Subjects: cs.SE (Software Engineering); cs.AI (Artificial Intelligence)
Title: SpecTool: A Benchmark for Characterizing Errors in Tool-Use LLMs
Authors: Shirley Kokane, Ming Zhu, Tulika Awalgaonkar, Jianguo Zhang, Thai Hoang, Akshara Prabhakar, Zuxin Liu, Tian Lan, Liangwei Yang, Juntao Tan, Rithesh Murthy, Weiran Yao, Zhiwei Liu, Juan Carlos Niebles, Huan Wang, Shelby Heinecke, Caiming Xiong, Silvio Savarese
Abstract: Evaluating the output of Large Language Models (LLMs) is one of the most critical aspects of building a performant compound AI system. Since the output from LLMs propagates to downstream steps, identifying LLM errors is crucial to system performance. A common task for LLMs in AI systems is tool use. While there are several benchmark environments for evaluating LLMs on this task, they typically only give a success rate without any explanation of the failure cases. To solve this problem, we introduce SpecTool, a new benchmark to identify error patterns in LLM output on tool-use tasks. Our benchmark data set comprises queries from diverse environments that can be used to test for the presence of seven newly characterized error patterns. Using SpecTool, we show that even the most prominent LLMs exhibit these error patterns in their outputs. Researchers can use the analysis and insights from SpecTool to guide their error mitigation strategies.
Submitted 20 November, 2024; originally announced November 2024.

arXiv:2411.13503 (https://arxiv.org/abs/2411.13503) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: VBench++: Comprehensive and Versatile Benchmark Suite for Video Generative Models
Authors: Ziqi Huang, Fan Zhang, Xiaojie Xu, Yinan He, Jiashuo Yu, Ziyue Dong, Qianli Ma, Nattapol Chanpaisit, Chenyang Si, Yuming Jiang, Yaohui Wang, Xinyuan Chen, Ying-Cong Chen, Limin Wang, Dahua Lin, Yu Qiao, Ziwei Liu
Abstract: Video generation has witnessed significant advancements, yet evaluating these models remains a challenge. A comprehensive evaluation benchmark for video generation is indispensable for two reasons: 1) Existing metrics do not fully align with human perceptions; 2) An ideal evaluation system should provide insights to inform future developments of video generation. To this end, we present VBench, a comprehensive benchmark suite that dissects "video generation quality" into specific, hierarchical, and disentangled dimensions, each with tailored prompts and evaluation methods. VBench has several appealing properties: 1) Comprehensive Dimensions: VBench comprises 16 dimensions in video generation (e.g., subject identity inconsistency, motion smoothness, temporal flickering, and spatial relationship, etc). The evaluation metrics with fine-grained levels reveal individual models' strengths and weaknesses. 2) Human Alignment: We also provide a dataset of human preference annotations to validate our benchmarks' alignment with human perception, for each evaluation dimension respectively. 3) Valuable Insights: We look into current models' ability across various evaluation dimensions, and various content types. We also investigate the gaps between video and image generation models. 4) Versatile Benchmarking: VBench++ supports evaluating text-to-video and image-to-video. We introduce a high-quality Image Suite with an adaptive aspect ratio to enable fair evaluations across different image-to-video generation settings. Beyond assessing technical quality, VBench++ evaluates the trustworthiness of video generative models, providing a more holistic view of model performance. 5) Full Open-Sourcing: We fully open-source VBench++ and continually add new video generation models to our leaderboard to drive forward the field of video generation.
Submitted 20 November, 2024; originally announced November 2024.
Comments: Leaderboard: https://huggingface.co/spaces/Vchitect/VBench_Leaderboard; Code: https://github.com/Vchitect/VBench; Project page: https://vchitect.github.io/VBench-project/; extension of arXiv:2311.17982. arXiv admin note: substantial text overlap with arXiv:2311.17982

arXiv:2411.12915 (https://arxiv.org/abs/2411.12915) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: VILA-M3: Enhancing Vision-Language Models with Medical Expert Knowledge
Authors: Vishwesh Nath, Wenqi Li, Dong Yang, Andriy Myronenko, Mingxin Zheng, Yao Lu, Zhijian Liu, Hongxu Yin, Yee Man Law, Yucheng Tang, Pengfei Guo, Can Zhao, Ziyue Xu, Yufan He, Greg Heinrich, Stephen Aylward, Marc Edgar, Michael Zephyr, Pavlo Molchanov, Baris Turkbey, Holger Roth, Daguang Xu
Abstract: Generalist vision language models (VLMs) have made significant strides in computer vision, but they fall short in specialized fields like healthcare, where expert knowledge is essential. In traditional computer vision tasks, creative or approximate answers may be acceptable, but in healthcare, precision is paramount. Current large multimodal models like Gemini and GPT-4o are insufficient for medical tasks due to their reliance on memorized internet knowledge rather than the nuanced expertise required in healthcare. VLMs are usually trained in three stages: vision pre-training, vision-language pre-training, and instruction fine-tuning (IFT). IFT has been typically applied using a mixture of generic and healthcare data. In contrast, we propose that for medical VLMs, a fourth stage of specialized IFT is necessary, which focuses on medical data and includes information from domain expert models. Domain expert models developed for medical use are crucial because they are specifically trained for certain clinical tasks, e.g. to detect tumors and classify abnormalities through segmentation and classification, which learn fine-grained features of medical data - features that are often too intricate for a VLM to capture effectively especially in radiology. This paper introduces a new framework, VILA-M3, for medical VLMs that utilizes domain knowledge via expert models. Through our experiments, we show an improved state-of-the-art (SOTA) performance with an average improvement of ~9% over the prior SOTA model Med-Gemini and ~6% over models trained on the specific tasks. Our approach emphasizes the importance of domain expertise in creating precise, reliable VLMs for medical applications.
Submitted 19 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12841">arXiv:2411.12841</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12841">pdf</a>, <a href="https://arxiv.org/format/2411.12841">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Data-to-Model Distillation: Data-Efficient Learning Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sajedi%2C+A">Ahmad Sajedi</a>, <a href="/search/cs?searchtype=author&amp;query=Khaki%2C+S">Samir Khaki</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L+Z">Lucy Z. Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Amjadian%2C+E">Ehsan Amjadian</a>, <a href="/search/cs?searchtype=author&amp;query=Lawryshyn%2C+Y+A">Yuri A. Lawryshyn</a>, <a href="/search/cs?searchtype=author&amp;query=Plataniotis%2C+K+N">Konstantinos N. Plataniotis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12841v1-abstract-short" style="display: inline;"> Dataset distillation aims to distill the knowledge of a large-scale real dataset into small yet informative synthetic data such that a model trained on it performs as well as a model trained on the full dataset. Despite recent progress, existing dataset distillation methods often struggle with computational efficiency, scalability to complex high-resolution datasets, and generalizability to deep a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12841v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12841v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12841v1-abstract-full" style="display: none;"> Dataset distillation aims to distill the knowledge of a large-scale real dataset into small yet informative synthetic data such that a model trained on it performs as well as a model trained on the full dataset. Despite recent progress, existing dataset distillation methods often struggle with computational efficiency, scalability to complex high-resolution datasets, and generalizability to deep architectures. These approaches typically require retraining when the distillation ratio changes, as knowledge is embedded in raw pixels. In this paper, we propose a novel framework called Data-to-Model Distillation (D2M) to distill the real dataset&#39;s knowledge into the learnable parameters of a pre-trained generative model by aligning rich representations extracted from real and generated images. The learned generative model can then produce informative training images for different distillation ratios and deep architectures. Extensive experiments on 15 datasets of varying resolutions show D2M&#39;s superior performance, re-distillation efficiency, and cross-architecture generalizability. Our method effectively scales up to high-resolution 128x128 ImageNet-1K. Furthermore, we verify D2M&#39;s practical benefits for downstream applications in neural architecture search. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12841v1-abstract-full').style.display = 'none'; document.getElementById('2411.12841v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted in the 18th European Conference on Computer Vision (ECCV 2024), Milan, Italy, September 29 October 4, 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12773">arXiv:2411.12773</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12773">pdf</a>, <a href="https://arxiv.org/format/2411.12773">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Decoupling Training-Free Guided Diffusion by ADMM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Youyuan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zehua Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zenan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhaoyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Clark%2C+J+J">James J. Clark</a>, <a href="/search/cs?searchtype=author&amp;query=Si%2C+X">Xujie Si</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12773v1-abstract-short" style="display: inline;"> In this paper, we consider the conditional generation problem by guiding off-the-shelf unconditional diffusion models with differentiable loss functions in a plug-and-play fashion. While previous research has primarily focused on balancing the unconditional diffusion model and the guided loss through a tuned weight hyperparameter, we propose a novel framework that distinctly decouples these two co&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12773v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12773v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12773v1-abstract-full" style="display: none;"> In this paper, we consider the conditional generation problem by guiding off-the-shelf unconditional diffusion models with differentiable loss functions in a plug-and-play fashion. While previous research has primarily focused on balancing the unconditional diffusion model and the guided loss through a tuned weight hyperparameter, we propose a novel framework that distinctly decouples these two components. Specifically, we introduce two variables ${x}$ and ${z}$, to represent the generated samples governed by the unconditional generation model and the guidance function, respectively. This decoupling reformulates conditional generation into two manageable subproblems, unified by the constraint ${x} = {z}$. 
arXiv:2411.12773 (https://arxiv.org/abs/2411.12773) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: Decoupling Training-Free Guided Diffusion by ADMM
Authors: Youyuan Zhang, Zehua Liu, Zenan Li, Zhaoyu Li, James J. Clark, Xujie Si
Abstract: In this paper, we consider the conditional generation problem by guiding off-the-shelf unconditional diffusion models with differentiable loss functions in a plug-and-play fashion. While previous research has primarily focused on balancing the unconditional diffusion model and the guided loss through a tuned weight hyperparameter, we propose a novel framework that distinctly decouples these two components. Specifically, we introduce two variables $x$ and $z$ to represent the generated samples governed by the unconditional generation model and the guidance function, respectively. This decoupling reformulates conditional generation into two manageable subproblems, unified by the constraint $x = z$. Leveraging this setup, we develop a new algorithm based on the Alternating Direction Method of Multipliers (ADMM) to adaptively balance these components. Additionally, we establish the equivalence between the diffusion reverse step and the proximal operator of ADMM and provide a detailed convergence analysis of our algorithm under certain mild assumptions. Our experiments demonstrate that our proposed method ADMMDiff consistently generates high-quality samples while ensuring strong adherence to the conditioning criteria. It outperforms existing methods across a range of conditional generation tasks, including image generation with various guidance and controllable motion synthesis.
Submitted 18 November, 2024; originally announced November 2024.

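The x/z splitting described in this abstract maps naturally onto a textbook ADMM loop: an x-update tied to the unconditional diffusion prior (the paper relates it to a proximal operator), a z-update that descends the guidance loss plus the quadratic coupling term, and a dual update enforcing $x = z$. The sketch below is schematic only; `denoise_prox`, `guidance_loss`, the step counts, and the penalty parameter are placeholders, not ADMMDiff itself.

```python
# Schematic ADMM splitting for guided generation, following the x / z decoupling
# described in the abstract. `denoise_prox` and `guidance_loss` are placeholder
# callables; the paper's actual ADMMDiff updates and schedules are not reproduced.
import torch

def admm_guided_sampling(denoise_prox, guidance_loss, x0, rho=1.0, steps=50, lr=0.1):
    x = x0.clone()
    z = x0.clone()
    u = torch.zeros_like(x0)                      # scaled dual variable for the constraint x = z
    for _ in range(steps):
        # x-update: proximal step tied to the (unconditional) diffusion prior.
        with torch.no_grad():
            x = denoise_prox(z - u, rho)
        # z-update: a few gradient steps on guidance loss + quadratic coupling term.
        z = z.detach().requires_grad_(True)
        for _ in range(5):
            obj = guidance_loss(z) + (rho / 2) * ((x - z + u) ** 2).sum()
            grad, = torch.autograd.grad(obj, z)
            z = (z - lr * grad).detach().requires_grad_(True)
        z = z.detach()
        # Dual update enforcing consensus between the two subproblems.
        u = u + x - z
    return x
```
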
arXiv:2411.12360 (https://arxiv.org/abs/2411.12360) [pdf, ps, other]
Subjects: cs.CR (Cryptography and Security); cs.DS (Data Structures and Algorithms)
Title: An Affine Equivalence Algorithm for S-boxes based on Matrix Invariants
Authors: Xincheng Hu, Xiao Zeng, Zhaoqiang Liu, Guowu Yang
Abstract: We investigate the affine equivalence (AE) problem of S-boxes. Given two S-boxes denoted as $S_1$ and $S_2$, we aim to seek two invertible AE transformations $A,B$ such that $S_1\circ A = B\circ S_2$ holds. Due to its important applications in the analysis and design of block ciphers, the investigation of AE algorithms has gained growing significance. In this paper, we first propose a zeroization operation on S-boxes, by which the AE problem can be transformed into $2^n$ linear equivalence problems. Second, we propose the standard orthogonal spatial matrix (SOSM), whose rank is invariant under AE transformations. Finally, based on the zeroization operation and the SOSM method, we propose a depth-first search (DFS) method for determining AE of S-boxes, named the AE_SOSM_DFS algorithm. Using this matrix invariant, we reduce the time complexity of the algorithm to approximately $\frac{1}{2^n}$ of the complexity without SOSM. Specifically, the complexity of our algorithm is $O(2^{3n})$. In addition, we also conducted experiments with non-invertible S-boxes, and the performance is similar to that of invertible S-boxes. Moreover, our proposed algorithm can effectively handle S-boxes with low algebraic degree as well as certain popular S-boxes such as AES and ARIA_s2, which are difficult to handle with the algorithm proposed by Dinur (2018). Using our algorithm, it takes only 5.5 seconds to find that the seven popular S-boxes AES, ARIA_s2, Camellia, Chiasmus, DBlock, SEED_S0, and SMS4 are affine equivalent, and the AE transformations of these S-boxes are provided.
Submitted 19 November, 2024; originally announced November 2024.

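For readers unfamiliar with the notation, the relation being sought is $S_1(A(x)) = B(S_2(x))$ for every input $x$ in GF(2)^n. The helper below only verifies a candidate pair of affine maps against two lookup tables; the paper's actual search (zeroization, SOSM rank invariant, DFS) is not reproduced, and the bit-encoding conventions here are assumptions for illustration.

```python
# Verify the affine-equivalence relation S1(A(x)) = B(S2(x)) over GF(2)^n for a
# given candidate pair of affine maps A, B. This only checks a proposed solution;
# the paper's zeroization / SOSM-rank / DFS search is not reproduced here.
import numpy as np

def affine_apply(M, c, x, n):
    """Apply y = M*x + c over GF(2), with x encoded as an integer of n bits."""
    bits = np.array([(x >> i) & 1 for i in range(n)], dtype=np.uint8)
    y_bits = (M @ bits + c) % 2
    return int(sum(int(b) << i for i, b in enumerate(y_bits)))

def is_affine_equivalent(S1, S2, A, B, n):
    """S1, S2: lookup tables of length 2^n; A = (MA, cA), B = (MB, cB) are
    invertible affine transformations given as (n x n matrix, length-n vector)."""
    (MA, cA), (MB, cB) = A, B
    return all(
        S1[affine_apply(MA, cA, x, n)] == affine_apply(MB, cB, S2[x], n)
        for x in range(1 << n)
    )
```
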
arXiv:2411.12196 (https://arxiv.org/abs/2411.12196) [pdf, other]
Subjects: cs.CY (Computers and Society); cs.AI (Artificial Intelligence)
Title: A More Advanced Group Polarization Measurement Approach Based on LLM-Based Agents and Graphs
Authors: Zixin Liu, Ji Zhang, Yiran Ding
Abstract: Group polarization is an important research direction in social media content analysis, attracting many researchers to explore this field. Therefore, how to effectively measure group polarization has become a critical topic. Measuring group polarization on social media presents several challenges that have not yet been addressed by existing solutions. First, social media group polarization measurement involves processing vast amounts of text, which poses a significant challenge for information extraction. Second, social media texts often contain hard-to-understand content, including sarcasm, memes, and internet slang. Additionally, group polarization research focuses on holistic analysis, while texts are typically fragmented. To address these challenges, we designed a solution based on a multi-agent system and used a graph-structured Community Sentiment Network (CSN) to represent polarization states. Furthermore, we developed a metric called the Community Opposition Index (COI) based on the CSN to quantify polarization. Finally, we tested our multi-agent system through a zero-shot stance detection task and achieved outstanding results. In summary, the proposed approach has significant value in terms of usability, accuracy, and interpretability.
Submitted 18 November, 2024; originally announced November 2024.

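The abstract names a graph-structured Community Sentiment Network (CSN) and a Community Opposition Index (COI) but defines neither, so the snippet below is purely illustrative: it scores opposition as the fraction of signed inter-community edge weight that is negative, an assumed stand-in rather than the paper's actual metric.

```python
# Purely illustrative: one way to score "opposition" on a signed community graph.
# The paper's CSN construction and COI definition are not given in the abstract,
# so this formula is an assumption, not their metric.
import networkx as nx

def community_opposition_index(csn: nx.Graph) -> float:
    """Fraction of total edge weight that carries negative sentiment.
    Expects each edge to have a signed 'sentiment' attribute in [-1, 1]."""
    neg = sum(abs(d["sentiment"]) for _, _, d in csn.edges(data=True) if d["sentiment"] < 0)
    total = sum(abs(d["sentiment"]) for _, _, d in csn.edges(data=True))
    return neg / total if total else 0.0

# Example: three communities with mixed sentiment between them.
g = nx.Graph()
g.add_edge("community_A", "community_B", sentiment=-0.8)
g.add_edge("community_A", "community_C", sentiment=0.4)
print(community_opposition_index(g))  # ~0.667
```
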
arXiv:2411.12156 (https://arxiv.org/abs/2411.12156) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: HNCSE: Advancing Sentence Embeddings via Hybrid Contrastive Learning with Hard Negatives
Authors: Wenxiao Liu, Zihong Yang, Chaozhuo Li, Zijin Hong, Jianfeng Ma, Zhiquan Liu, Litian Zhang, Feiran Huang
Abstract: Unsupervised sentence representation learning remains a critical challenge in modern natural language processing (NLP) research. Recently, contrastive learning techniques have achieved significant success in addressing this issue by effectively capturing textual semantics. Many such approaches prioritize the optimization using negative samples. In fields such as computer vision, hard negative samples (samples that are close to the decision boundary and thus more difficult to distinguish) have been shown to enhance representation learning. However, adapting hard negatives to contrastive sentence learning is complex due to the intricate syntactic and semantic details of text. To address this problem, we propose HNCSE, a novel contrastive learning framework that extends the leading SimCSE approach. The hallmark of HNCSE is its innovative use of hard negative samples to enhance the learning of both positive and negative samples, thereby achieving a deeper semantic understanding. Empirical tests on semantic textual similarity and transfer task datasets validate the superiority of HNCSE.
Submitted 18 November, 2024; originally announced November 2024.

<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12147v1-abstract-full').style.display = 'none'; document.getElementById('2411.12147v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12028">arXiv:2411.12028</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.12028">pdf</a>, <a href="https://arxiv.org/format/2411.12028">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> In-Situ Melt Pool Characterization via Thermal Imaging for Defect Detection in Directed Energy Deposition Using Vision Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Era%2C+I+Z">Israt Zarin Era</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+F">Fan Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Raihan%2C+A+S">Ahmed Shoyeb Raihan</a>, <a href="/search/cs?searchtype=author&amp;query=Ahmed%2C+I">Imtiaz Ahmed</a>, <a href="/search/cs?searchtype=author&amp;query=Abul-Haj%2C+A">Alan Abul-Haj</a>, <a href="/search/cs?searchtype=author&amp;query=Craig%2C+J">James Craig</a>, <a href="/search/cs?searchtype=author&amp;query=Das%2C+S">Srinjoy Das</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhichao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12028v1-abstract-short" style="display: inline;"> Directed Energy Deposition (DED) offers significant potential for manufacturing complex and multi-material parts. However, internal defects such as porosity and cracks can compromise mechanical properties and overall performance. This study focuses on in-situ monitoring and characterization of melt pools associated with porosity, aiming to improve defect detection and quality control in DED-printe&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12028v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12028v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12028v1-abstract-full" style="display: none;"> Directed Energy Deposition (DED) offers significant potential for manufacturing complex and multi-material parts. However, internal defects such as porosity and cracks can compromise mechanical properties and overall performance. This study focuses on in-situ monitoring and characterization of melt pools associated with porosity, aiming to improve defect detection and quality control in DED-printed parts. Traditional machine learning approaches for defect identification rely on extensive labeled datasets, often scarce and expensive to generate in real-world manufacturing. 
arXiv:2411.12028 (https://arxiv.org/abs/2411.12028) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: In-Situ Melt Pool Characterization via Thermal Imaging for Defect Detection in Directed Energy Deposition Using Vision Transformers
Authors: Israt Zarin Era, Fan Zhou, Ahmed Shoyeb Raihan, Imtiaz Ahmed, Alan Abul-Haj, James Craig, Srinjoy Das, Zhichao Liu
Abstract: Directed Energy Deposition (DED) offers significant potential for manufacturing complex and multi-material parts. However, internal defects such as porosity and cracks can compromise mechanical properties and overall performance. This study focuses on in-situ monitoring and characterization of melt pools associated with porosity, aiming to improve defect detection and quality control in DED-printed parts. Traditional machine learning approaches for defect identification rely on extensive labeled datasets, often scarce and expensive to generate in real-world manufacturing. To address this, our framework employs self-supervised learning on unlabeled melt pool data using a Vision Transformer-based Masked Autoencoder (MAE) to produce highly representative embeddings. These fine-tuned embeddings are leveraged via transfer learning to train classifiers on a limited labeled dataset, enabling the effective identification of melt pool anomalies. We evaluate two classifiers: (1) a Vision Transformer (ViT) classifier utilizing the fine-tuned MAE Encoder's parameters and (2) the fine-tuned MAE Encoder combined with an MLP classifier head. Our framework achieves overall accuracy ranging from 95.44% to 99.17% and an average F1 score exceeding 80%, with the ViT Classifier slightly outperforming the MAE Encoder Classifier. This demonstrates the scalability and cost-effectiveness of our approach for automated quality control in DED, effectively detecting defects with minimal labeled data.
Submitted 18 November, 2024; originally announced November 2024.

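The transfer step described here, reusing a self-supervised encoder and training a small classifier head on limited labels, follows a common pattern; a hedged sketch is below. The encoder is a placeholder `nn.Module` standing in for the fine-tuned MAE/ViT encoder, and the head sizes and two-class setup are arbitrary choices, not the paper's configuration.

```python
# Generic transfer-learning sketch matching the described pipeline: reuse a
# self-supervised (MAE-style) encoder and train a small MLP head on the limited
# labeled melt-pool data. The encoder is a placeholder module; the paper's exact
# ViT/MAE configuration is not reproduced.
import torch
import torch.nn as nn

class MeltPoolClassifier(nn.Module):
    def __init__(self, pretrained_encoder: nn.Module, embed_dim: int, num_classes: int = 2,
                 freeze_encoder: bool = True):
        super().__init__()
        self.encoder = pretrained_encoder
        if freeze_encoder:                        # keep the self-supervised features fixed
            for p in self.encoder.parameters():
                p.requires_grad = False
        self.head = nn.Sequential(                # lightweight MLP classifier head
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, 256), nn.GELU(),
            nn.Linear(256, num_classes),
        )

    def forward(self, thermal_images):
        features = self.encoder(thermal_images)   # assumed to return (batch, embed_dim)
        return self.head(features)
```
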
arXiv:2411.11930 (https://arxiv.org/abs/2411.11930) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: AtomThink: A Slow Thinking Framework for Multimodal Mathematical Reasoning
Authors: Kun Xiang, Zhili Liu, Zihao Jiang, Yunshuang Nie, Runhui Huang, Haoxiang Fan, Hanhui Li, Weiran Huang, Yihan Zeng, Jianhua Han, Lanqing Hong, Hang Xu, Xiaodan Liang
Abstract: In this paper, we address the challenging task of multimodal mathematical reasoning by incorporating the ability of "slow thinking" into multimodal large language models (MLLMs). Contrary to existing methods that rely on direct or fast thinking, our key idea is to construct long chains of thought (CoT) consisting of atomic actions in a step-by-step manner, guiding MLLMs to perform complex reasoning. To this end, we design a novel AtomThink framework composed of three key modules: (i) a CoT annotation engine that automatically generates high-quality CoT annotations to address the lack of high-quality visual mathematical data; (ii) an atomic step fine-tuning strategy that jointly optimizes an MLLM and a policy reward model (PRM) for step-wise reasoning; and (iii) four different search strategies that can be applied with the PRM to complete reasoning. Additionally, we propose AtomMATH, a large-scale multimodal dataset of long CoTs, and an atomic capability evaluation metric for mathematical tasks. Extensive experimental results show that the proposed AtomThink significantly improves the performance of baseline MLLMs, achieving approximately 50% relative accuracy gains on MathVista and 120% on MathVerse. To support the advancement of multimodal slow-thinking models, we will make our code and dataset publicly available on https://github.com/Quinn777/AtomThink.
Submitted 21 November, 2024; v1 submitted 18 November, 2024; originally announced November 2024.

arXiv:2411.11910 (https://arxiv.org/abs/2411.11910) [pdf, ps, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
Title: AIGS: Generating Science from AI-Powered Automated Falsification
Authors: Zijun Liu, Kaiming Liu, Yiqi Zhu, Xuanyu Lei, Zonghan Yang, Zhenhe Zhang, Peng Li, Yang Liu
Abstract: The rapid development of artificial intelligence has drastically accelerated the pace of scientific discovery. Trained with large-scale observational data, deep neural networks extract the underlying patterns in an end-to-end manner and assist human researchers with highly precise predictions in unseen scenarios. The recent rise of Large Language Models (LLMs) and the empowered autonomous agents enable scientists to gain help through interaction in different stages of their research, including but not limited to literature review, research ideation, idea implementation, and academic writing. However, AI researchers instantiated by foundation-model-empowered agents with full-process autonomy are still in their infancy. In this paper, we study AI-Generated Science (AIGS), where agents independently and autonomously complete the entire research process and discover scientific laws. By revisiting the definition of scientific research, we argue that falsification is the essence of both the human research process and the design of an AIGS system. Through the lens of falsification, prior systems attempting AI-generated science either lack this component in their design or rely heavily on existing verification engines, which narrows their use to specialized domains. In this work, we propose Baby-AIGS as a baby-step demonstration of a full-process AIGS system: a multi-agent system with agents in roles representing key stages of the research process. By introducing a FalsificationAgent, which identifies and then verifies possible scientific discoveries, we empower the system with explicit falsification. Experiments on three tasks preliminarily show that Baby-AIGS can produce meaningful scientific discoveries, though not on par with experienced human researchers. Finally, we discuss the limitations of the current Baby-AIGS, actionable insights, and related ethical issues in detail.
Submitted 24 November, 2024; v1 submitted 17 November, 2024; originally announced November 2024.
Comments: Pre-print. 35 pages. Official website: https://agent-force.github.io/AIGS/

href="/search/cs?searchtype=author&amp;query=Zhang%2C+S">Simin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+M">Mingxing Wen</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhaojie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Gai%2C+K">Kun Gai</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+G">Guorui Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11739v1-abstract-short" style="display: inline;"> In recent years, with the significant evolution of multi-modal large models, many recommender researchers realized the potential of multi-modal information for user interest modeling. In industry, a wide-used modeling architecture is a cascading paradigm: (1) first pre-training a multi-modal model to provide omnipotent representations for downstream services; (2) The downstream recommendation mode&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11739v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11739v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11739v1-abstract-full" style="display: none;"> In recent years, with the significant evolution of multi-modal large models, many recommender researchers realized the potential of multi-modal information for user interest modeling. In industry, a wide-used modeling architecture is a cascading paradigm: (1) first pre-training a multi-modal model to provide omnipotent representations for downstream services; (2) The downstream recommendation model takes the multi-modal representation as additional input to fit real user-item behaviours. Although such paradigm achieves remarkable improvements, however, there still exist two problems that limit model performance: (1) Representation Unmatching: The pre-trained multi-modal model is always supervised by the classic NLP/CV tasks, while the recommendation models are supervised by real user-item interaction. As a result, the two fundamentally different tasks&#39; goals were relatively separate, and there was a lack of consistent objective on their representations; (2) Representation Unlearning: The generated multi-modal representations are always stored in cache store and serve as extra fixed input of recommendation model, thus could not be updated by recommendation model gradient, further unfriendly for downstream training. Inspired by the two difficulties challenges in downstream tasks usage, we introduce a quantitative multi-modal framework to customize the specialized and trainable multi-modal information for different downstream models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11739v1-abstract-full').style.display = 'none'; document.getElementById('2411.11739v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> N/A </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11714">arXiv:2411.11714</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11714">pdf</a>, <a href="https://arxiv.org/format/2411.11714">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Semantic-Geometric-Physical-Driven Robot Manipulation Skill Transfer via Skill Library and Tactile Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qi%2C+M">Mingchao Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yuanjin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xing Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zhengxiong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+P">Panfeng Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11714v1-abstract-short" style="display: inline;"> Deploying robots in open-world environments involves complex tasks characterized by long sequences and rich interactions, necessitating efficient transfer of robotic skills across diverse and complex scenarios. To address this challenge, we propose a skill library framework based on knowledge graphs, which endows robots with high-level skill awareness and spatial semantic understanding. The framew&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11714v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11714v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11714v1-abstract-full" style="display: none;"> Deploying robots in open-world environments involves complex tasks characterized by long sequences and rich interactions, necessitating efficient transfer of robotic skills across diverse and complex scenarios. To address this challenge, we propose a skill library framework based on knowledge graphs, which endows robots with high-level skill awareness and spatial semantic understanding. The framework hierarchically organizes operational knowledge by constructing a &#34;task graph&#34; and a &#34;scene graph&#34; to represent task and scene semantic information, respectively. We introduce a &#34;state graph&#34; to facilitate interaction between high-level task planning and low-level scene information. Furthermore, we propose a hierarchical transfer framework for operational skills. At the task level, the framework integrates contextual learning and chain-of-thought prompting within a four-stage prompt paradigm, leveraging large language models&#39; (LLMs) reasoning and generalization capabilities to achieve task-level subtask sequence transfer. 
arXiv:2411.11714 (https://arxiv.org/abs/2411.11714) [pdf, other]
Subjects: cs.RO (Robotics); cs.AI (Artificial Intelligence)
Title: Semantic-Geometric-Physical-Driven Robot Manipulation Skill Transfer via Skill Library and Tactile Representation
Authors: Mingchao Qi, Yuanjin Li, Xing Liu, Zhengxiong Liu, Panfeng Huang
Abstract: Deploying robots in open-world environments involves complex tasks characterized by long sequences and rich interactions, necessitating efficient transfer of robotic skills across diverse and complex scenarios. To address this challenge, we propose a skill library framework based on knowledge graphs, which endows robots with high-level skill awareness and spatial semantic understanding. The framework hierarchically organizes operational knowledge by constructing a "task graph" and a "scene graph" to represent task and scene semantic information, respectively. We introduce a "state graph" to facilitate interaction between high-level task planning and low-level scene information. Furthermore, we propose a hierarchical transfer framework for operational skills. At the task level, the framework integrates contextual learning and chain-of-thought prompting within a four-stage prompt paradigm, leveraging large language models' (LLMs) reasoning and generalization capabilities to achieve task-level subtask sequence transfer. At the motion level, an adaptive trajectory transfer method is developed using the A* algorithm and the skill library, enabling motion-level adaptive trajectory transfer. At the physical level, we introduce an adaptive contour extraction and posture perception method based on tactile perception. This method dynamically obtains high-precision contour and posture information from visual-tactile texture data and adjusts transferred skills, such as contact positions and postures, to ensure effectiveness in new environments. Experimental results validate the effectiveness of the proposed methods. Project website: https://github.com/MingchaoQi/skill_transfer
Submitted 18 November, 2024; originally announced November 2024.

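Of the three levels described, only the motion level names a concrete classical algorithm, A*. The snippet below is a standard grid A* planner, included only to make that component concrete; the authors' adaptive trajectory transfer built on top of A* and the skill library is not reproduced.

```python
# Standard A* on a 4-connected grid, shown only to make the A* component of the
# motion-level transfer concrete; it is not the paper's adaptive transfer method.
import heapq

def astar(grid, start, goal):
    """grid: 2D list, 0 = free, 1 = obstacle; start/goal: (row, col). Returns a path or None."""
    rows, cols = len(grid), len(grid[0])
    h = lambda p: abs(p[0] - goal[0]) + abs(p[1] - goal[1])   # Manhattan heuristic
    open_set = [(h(start), 0, start, [start])]
    best_g = {start: 0}
    while open_set:
        _, g, node, path = heapq.heappop(open_set)
        if node == goal:
            return path
        for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            nxt = (node[0] + dr, node[1] + dc)
            if 0 <= nxt[0] < rows and 0 <= nxt[1] < cols and grid[nxt[0]][nxt[1]] == 0:
                ng = g + 1
                if ng < best_g.get(nxt, float("inf")):
                    best_g[nxt] = ng
                    heapq.heappush(open_set, (ng + h(nxt), ng, nxt, path + [nxt]))
    return None

print(astar([[0, 0, 0], [1, 1, 0], [0, 0, 0]], (0, 0), (2, 0)))
```
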
arXiv:2411.11694 (https://arxiv.org/abs/2411.11694) [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: Technical Report: Enhancing LLM Reasoning with Reward-guided Tree Search
Authors: Jinhao Jiang, Zhipeng Chen, Yingqian Min, Jie Chen, Xiaoxue Cheng, Jiapeng Wang, Yiru Tang, Haoxiang Sun, Jia Deng, Wayne Xin Zhao, Zheng Liu, Dong Yan, Jian Xie, Zhongyuan Wang, Ji-Rong Wen
Abstract: Recently, test-time scaling has garnered significant attention from the research community, largely due to the substantial advancements of the o1 model released by OpenAI. By allocating more computational resources during the inference phase, large language models (LLMs) can extensively explore the solution space by generating more thought tokens or diverse solutions, thereby producing more accurate responses. However, developing an o1-like reasoning approach is challenging, and researchers have been making various attempts to advance this open area of research. In this paper, we present a preliminary exploration into enhancing the reasoning abilities of LLMs through reward-guided tree search algorithms. This framework is implemented by integrating the policy model, reward model, and search algorithm. It is primarily constructed around a tree search algorithm, where the policy model navigates a dynamically expanding tree guided by a specially trained reward model. We thoroughly explore various design considerations necessary for implementing this framework and provide a detailed report of the technical aspects. To assess the effectiveness of our approach, we focus on mathematical reasoning tasks and conduct extensive evaluations on four challenging datasets, significantly enhancing the reasoning abilities of LLMs.
Submitted 18 November, 2024; originally announced November 2024.
Comments: LLM; Complex Reasoning; Math.

arXiv:2411.11543 (https://arxiv.org/abs/2411.11543) [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Enhancing Vision-Language Model Safety through Progressive Concept-Bottleneck-Driven Alignment
Authors: Zhendong Liu, Yuanbi Nie, Yingshui Tan, Xiangyu Yue, Qiushi Cui, Chongjun Wang, Xiaoyong Zhu, Bo Zheng
Abstract: Benefiting from the powerful capabilities of Large Language Models (LLMs), pre-trained visual encoder models connected to LLMs form Vision Language Models (VLMs). However, recent research shows that the visual modality in VLMs is highly vulnerable, allowing attackers to bypass safety alignment in LLMs through visually transmitted content and launch harmful attacks. To address this challenge, we propose a progressive concept-based alignment strategy, PSA-VLM, which incorporates safety modules as concept bottlenecks to enhance visual modality safety alignment. By aligning model predictions with specific safety concepts, we improve defenses against risky images, enhancing explainability and controllability while minimally impacting general performance. Our method is obtained through two-stage training. The low computational cost of the first stage brings a very effective performance improvement, and the fine-tuning of the language model in the second stage further improves the safety performance. Our method achieves state-of-the-art results on popular VLM safety benchmarks.
Submitted 18 November, 2024; originally announced November 2024.
Comments: arXiv admin note: substantial text overlap with arXiv:2405.13581.

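A concept bottleneck in the sense used here forces the safety decision to pass through a small set of interpretable concept scores. The sketch below shows that generic structure on top of pooled visual features; the concept inventory, dimensions, and the two-stage training of PSA-VLM are assumptions not reflected here.

```python
# Generic concept-bottleneck head: visual features are first mapped to a small set
# of interpretable safety-concept scores, and the safety decision is made only from
# those scores. Concept count and sizes are illustrative, not PSA-VLM's design.
import torch
import torch.nn as nn

class SafetyConceptBottleneck(nn.Module):
    def __init__(self, feature_dim: int, num_concepts: int = 8, num_classes: int = 2):
        super().__init__()
        self.to_concepts = nn.Linear(feature_dim, num_concepts)   # concept predictor
        self.classifier = nn.Linear(num_concepts, num_classes)    # decides from concepts only

    def forward(self, visual_features):
        concept_scores = torch.sigmoid(self.to_concepts(visual_features))  # interpretable bottleneck
        safety_logits = self.classifier(concept_scores)
        return safety_logits, concept_scores

head = SafetyConceptBottleneck(feature_dim=768)
logits, concepts = head(torch.randn(4, 768))
print(logits.shape, concepts.shape)   # torch.Size([4, 2]) torch.Size([4, 8])
```
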
filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>
