Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,370 results for author: <span class="mathjax">Zhang, K</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Zhang%2C+K">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Zhang, K"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Zhang%2C+K&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Zhang, K"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14405">arXiv:2411.14405</a> <span> [<a href="https://arxiv.org/pdf/2411.14405">pdf</a>, <a href="https://arxiv.org/format/2411.14405">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Marco-o1: Towards Open Reasoning Models for Open-Ended Solutions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yu Zhao</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Huifeng Yin</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+B">Bo Zeng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hao Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+T">Tianqi Shi</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+C">Chenyang Lyu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Longyue Wang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+W">Weihua Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaifu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14405v1-abstract-short" style="display: inline;"> Currently OpenAI o1 has sparked a surge of interest in the study of large reasoning models (LRM). 
   Building on this momentum, Marco-o1 not only focuses on disciplines with standard answers, such as mathematics, physics, and coding -- which are well-suited for reinforcement learning (RL) -- but also places greater emphasis on open-ended resolutions. We aim to address the question: "Can the o1 model effectively generalize to broader domains where clear standards are absent and rewards are challenging to quantify?" Marco-o1 is powered by Chain-of-Thought (CoT) fine-tuning, Monte Carlo Tree Search (MCTS), reflection mechanisms, and innovative reasoning strategies -- optimized for complex real-world problem-solving tasks.
   Submitted 21 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14384">arXiv:2411.14384</a> <span> [<a href="https://arxiv.org/pdf/2411.14384">pdf</a>, <a href="https://arxiv.org/format/2411.14384">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> Baking Gaussian Splatting into Diffusion Denoiser for Fast and Scalable Single-stage Image-to-3D Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yuanhao Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">He Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yixun Liang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+M">Mengwei Ren</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qing Liu</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+S+Y">Soo Ye Kim</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhifei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuqian Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zhe Lin</a>, <a href="/search/cs?searchtype=author&query=Yuille%2C+A">Alan Yuille</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14384v1-abstract-short" style="display: inline;"> Existing feed-forward image-to-3D methods mainly rely on 2D multi-view diffusion models that cannot guarantee 3D consistency. These methods easily collapse when changing the prompt view direction and mainly handle object-centric prompt images. In this paper, we propose a novel single-stage 3D diffusion model, DiffusionGS, for object and scene generation from a single view. DiffusionGS directly out… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14384v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14384v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14384v1-abstract-full" style="display: none;"> Existing feed-forward image-to-3D methods mainly rely on 2D multi-view diffusion models that cannot guarantee 3D consistency. These methods easily collapse when changing the prompt view direction and mainly handle object-centric prompt images. In this paper, we propose a novel single-stage 3D diffusion model, DiffusionGS, for object and scene generation from a single view. DiffusionGS directly outputs 3D Gaussian point clouds at each timestep to enforce view consistency and allow the model to generate robustly given prompt views of any directions, beyond object-centric inputs. Plus, to improve the capability and generalization ability of DiffusionGS, we scale up 3D training data by developing a scene-object mixed training strategy. 
   Experiments show that our method enjoys better generation quality (2.20 dB higher in PSNR and 23.25 lower in FID) and over 5x faster speed (~6s on an A100 GPU) than SOTA methods. The user study and text-to-3D applications also reveal the practical value of our method. Our project page at https://caiyuanhao1998.github.io/project/DiffusionGS/ shows the video and interactive generation results.
   Submitted 21 November, 2024; originally announced November 2024.
   Comments: A novel one-stage 3DGS-based diffusion generates objects and scenes from a single view in ~6 seconds.

3. arXiv:2411.14249 [cs.RO]
   Title: Towards a Physics Engine to Simulate Robotic Laser Surgery: Finite Element Modeling of Thermal Laser-Tissue Interactions
   Authors: Nicholas E. Pacheco, Kang Zhang, Ashley S. Reyes, Christopher J. Pacheco, Lucas Burstein, Loris Fichera
   Abstract: This paper presents a computational model, based on the Finite Element Method (FEM), that simulates the thermal response of laser-irradiated tissue. This model addresses a gap in the current ecosystem of surgical robot simulators, which generally lack support for lasers and other energy-based end effectors.
   In the proposed model, the thermal dynamics of the tissue are calculated as the solution to a heat conduction problem with appropriate boundary conditions. The FEM formulation allows the model to capture complex phenomena, such as convection, which is crucial for creating realistic simulations. The accuracy of the model was verified via benchtop laser-tissue interaction experiments using agar tissue phantoms and ex-vivo chicken muscle. The results revealed an average root-mean-square error (RMSE) of less than 2 degrees Celsius across most experimental conditions.
   Submitted 21 November, 2024; originally announced November 2024.
   Comments: Submitted to the International Symposium on Medical Robotics 2025.

4. arXiv:2411.13821 [cs.LG, cs.AI, stat.ML] doi:10.1145/3701551.3703568
   Title: Heterophilic Graph Neural Networks Optimization with Causal Message-passing
   Authors: Botao Wang, Jia Li, Heng Chang, Keli Zhang, Fugee Tsung
   Abstract: In this work, we discover that causal inference provides a promising approach to capture heterophilic message-passing in Graph Neural Network (GNN). By leveraging cause-effect analysis, we can discern heterophilic edges based on asymmetric node dependency. The learned causal structure offers more accurate relationships among nodes.
   To reduce the computational complexity, we introduce intervention-based causal inference in graph learning. We first simplify causal analysis on graphs by formulating it as a structural learning model and define the optimization problem within the Bayesian scheme. We then present an analysis of decomposing the optimization target into a consistency penalty and a structure modification based on cause-effect relations. We then estimate this target by conditional entropy and present insights into how conditional entropy quantifies the heterophily. Accordingly, we propose CausalMP, a causal message-passing discovery network for heterophilic graph learning, that iteratively learns the explicit causal structure of input graphs. We conduct extensive experiments in both heterophilic and homophilic graph settings. The results demonstrate that our model achieves superior link prediction performance. Training on causal structure can also enhance node representation in classification tasks across different base models.
   Submitted 20 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13549">arXiv:2411.13549</a> <span> [<a href="https://arxiv.org/pdf/2411.13549">pdf</a>, <a href="https://arxiv.org/format/2411.13549">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generating 3D-Consistent Videos from Unposed Internet Photos </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chou%2C+G">Gene Chou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Hariharan%2C+B">Bharath Hariharan</a>, <a href="/search/cs?searchtype=author&query=Snavely%2C+N">Noah Snavely</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13549v1-abstract-short" style="display: inline;"> We address the problem of generating videos from unposed internet photos. A handful of input images serve as keyframes, and our model interpolates between them to simulate a path moving between the cameras. Given random images, a model's ability to capture underlying geometry, recognize scene identity, and relate frames in terms of camera position and orientation reflects a fundamental understandi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13549v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13549v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13549v1-abstract-full" style="display: none;"> We address the problem of generating videos from unposed internet photos. A handful of input images serve as keyframes, and our model interpolates between them to simulate a path moving between the cameras. Given random images, a model's ability to capture underlying geometry, recognize scene identity, and relate frames in terms of camera position and orientation reflects a fundamental understanding of 3D structure and scene layout. However, existing video models such as Luma Dream Machine fail at this task. We design a self-supervised method that takes advantage of the consistency of videos and variability of multiview internet photos to train a scalable, 3D-aware video model without any 3D annotations such as camera parameters. We validate that our method outperforms all baselines in terms of geometric and appearance consistency. We also show our model benefits applications that enable camera control, such as 3D Gaussian Splatting. Our results suggest that we can scale up scene-level 3D learning using only 2D data such as videos and multiview internet photos. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13549v1-abstract-full').style.display = 'none'; document.getElementById('2411.13549v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12882">arXiv:2411.12882</a> <span> [<a href="https://arxiv.org/pdf/2411.12882">pdf</a>, <a href="https://arxiv.org/format/2411.12882">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> ProSec: Fortifying Code LLMs with Proactive Security Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiangzhe Xu</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Z">Zian Su</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jinyao Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaiyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhenting Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiangyu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12882v1-abstract-short" style="display: inline;"> Recent advances in code-specific large language models (LLMs) have greatly enhanced code generation and refinement capabilities. However, the safety of code LLMs remains under-explored, posing potential risks as insecure code generated by these models may introduce vulnerabilities into real-world systems. Previous work proposes to collect security-focused instruction-tuning dataset from real-world… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12882v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12882v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12882v1-abstract-full" style="display: none;"> Recent advances in code-specific large language models (LLMs) have greatly enhanced code generation and refinement capabilities. However, the safety of code LLMs remains under-explored, posing potential risks as insecure code generated by these models may introduce vulnerabilities into real-world systems. Previous work proposes to collect security-focused instruction-tuning dataset from real-world vulnerabilities. It is constrained by the data sparsity of vulnerable code, and has limited applicability in the iterative post-training workflows of modern LLMs. In this paper, we propose ProSec, a novel proactive security alignment approach designed to align code LLMs with secure coding practices. 
   ProSec systematically exposes the vulnerabilities in a code LLM by synthesizing error-inducing coding scenarios from Common Weakness Enumerations (CWEs), and generates fixes to vulnerable code snippets, allowing the model to learn secure practices through advanced preference learning objectives. The scenarios synthesized by ProSec trigger 25 times more vulnerable code than a normal instruction-tuning dataset, resulting in a security-focused alignment dataset 7 times larger than the previous work. Experiments show that models trained with ProSec are 29.2% to 35.5% more secure compared to previous work, with a marginal negative effect of less than 2 percentage points on the model's utility.
   Submitted 19 November, 2024; originally announced November 2024.
   Comments: The first two authors contributed equally to this work.

7. arXiv:2411.11370 [cs.CV]
   Title: TL-CLIP: A Power-specific Multimodal Pre-trained Visual Foundation Model for Transmission Line Defect Recognition
   Authors: Ke Zhang, Zhaoye Zheng, Yurong Guo, Jiacun Wang, Jiyuan Yang, Yangjie Xiao
   Abstract: Transmission line defect recognition models have traditionally used general pre-trained weights as the initial basis for their training. These models often suffer from weak generalization capability due to the lack of domain knowledge in the pre-training dataset.
   To address this issue, we propose a two-stage transmission-line-oriented contrastive language-image pre-training (TL-CLIP) framework, which lays a more effective foundation for transmission line defect recognition. The pre-training process employs a novel power-specific multimodal algorithm assisted with two power-specific pre-training tasks for better modeling the power-related semantic knowledge contained in the inspection data. To fine-tune the pre-trained model, we develop a transfer learning strategy, namely fine-tuning with pre-training objective (FTP), to alleviate the overfitting problem caused by limited inspection data. Experimental results demonstrate that the proposed method significantly improves the performance of transmission line defect recognition in both classification and detection tasks, indicating clear advantages over traditional pre-trained models in the context of transmission line inspection.
   Submitted 18 November, 2024; originally announced November 2024.

8. arXiv:2411.11045 [cs.CV]
   Title: StableV2V: Stablizing Shape Consistency in Video-to-Video Editing
   Authors: Chang Liu, Rui Li, Kaidong Zhang, Yunwei Lan, Dong Liu
   Abstract: Recent advancements in generative AI have significantly promoted content creation and editing, where prevailing studies further extend this exciting progress to video editing.
   In doing so, these studies mainly transfer the inherent motion patterns from the source videos to the edited ones, where results with inferior consistency to user prompts are often observed, due to the lack of particular alignments between the delivered motions and edited contents. To address this limitation, we present a shape-consistent video editing method, namely StableV2V, in this paper. Our method decomposes the entire editing pipeline into several sequential procedures, where it edits the first video frame, then establishes an alignment between the delivered motions and user prompts, and eventually propagates the edited contents to all other frames based on such alignment. Furthermore, we curate a testing benchmark, namely DAVIS-Edit, for a comprehensive evaluation of video editing, considering various types of prompts and difficulties. Experimental results and analyses illustrate the superior performance, visual consistency, and inference efficiency of our method compared to existing state-of-the-art studies.
   Submitted 17 November, 2024; originally announced November 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://alonzoleeeooo.github.io/StableV2V, code: https://github.com/AlonzoLeeeooo/StableV2V, model weights: https://huggingface.co/AlonzoLeeeooo/StableV2V, dataset (DAVIS-Edit): https://huggingface.co/datasets/AlonzoLeeeooo/DAVIS-Edit</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09887">arXiv:2411.09887</a> <span> [<a href="https://arxiv.org/pdf/2411.09887">pdf</a>, <a href="https://arxiv.org/format/2411.09887">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Planning by Simulation: Motion Planning with Learning-based Parallel Scenario Prediction for Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Niu%2C+T">Tian Niu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaizhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+Z">Zhongxue Gan</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+W">Wenchao Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09887v1-abstract-short" style="display: inline;"> Planning safe trajectories for autonomous vehicles is essential for operational safety but remains extremely challenging due to the complex interactions among traffic participants. Recent autonomous driving frameworks have focused on improving prediction accuracy to explicitly model these interactions. However, some methods overlook the significant influence of the ego vehicle's planning on the po… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09887v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09887v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09887v1-abstract-full" style="display: none;"> Planning safe trajectories for autonomous vehicles is essential for operational safety but remains extremely challenging due to the complex interactions among traffic participants. Recent autonomous driving frameworks have focused on improving prediction accuracy to explicitly model these interactions. However, some methods overlook the significant influence of the ego vehicle's planning on the possible trajectories of other agents, which can alter prediction accuracy and lead to unsafe planning decisions. In this paper, we propose a novel motion Planning approach by Simulation with learning-based parallel scenario prediction (PS). PS deduces predictions iteratively based on Monte Carlo Tree Search (MCTS), jointly inferring scenarios that cooperate with the ego vehicle's planning set. Our method simulates possible scenes and calculates their costs after the ego vehicle executes potential actions. To balance and prune unreasonable actions and scenarios, we adopt MCTS as the foundation to explore possible future interactions encoded within the prediction network. 
   Moreover, the query-centric trajectory prediction streamlines our scene generation, enabling a sophisticated framework that captures the mutual influence between other agents' predictions and the ego vehicle's planning. We evaluate our framework on the Argoverse 2 dataset, and the results demonstrate that our approach effectively achieves parallel ego vehicle planning.
   Submitted 14 November, 2024; originally announced November 2024.

10. arXiv:2411.09167 [cs.SD, cs.CR, eess.AS]
   Title: Robust AI-Synthesized Speech Detection Using Feature Decomposition Learning and Synthesizer Feature Augmentation
   Authors: Kuiyuan Zhang, Zhongyun Hua, Yushu Zhang, Yifang Guo, Tao Xiang
   Abstract: AI-synthesized speech, also known as deepfake speech, has recently raised significant concerns due to the rapid advancement of speech synthesis and speech conversion techniques. Previous works often rely on distinguishing synthesizer artifacts to identify deepfake speech. However, excessive reliance on these specific synthesizer artifacts may result in unsatisfactory performance when addressing speech signals created by unseen synthesizers.
   In this paper, we propose a robust deepfake speech detection method that employs feature decomposition to learn synthesizer-independent content features as complementary for detection. Specifically, we propose a dual-stream feature decomposition learning strategy that decomposes the learned speech representation using a synthesizer stream and a content stream. The synthesizer stream specializes in learning synthesizer features through supervised training with synthesizer labels. Meanwhile, the content stream focuses on learning synthesizer-independent content features, enabled by a pseudo-labeling-based supervised learning method. This method randomly transforms speech to generate speed and compression labels for training. Additionally, we employ an adversarial learning technique to reduce the synthesizer-related components in the content stream. The final classification is determined by concatenating the synthesizer and content features. To enhance the model's robustness to different synthesizer characteristics, we further propose a synthesizer feature augmentation strategy that randomly blends the characteristic styles within real and fake audio features and randomly shuffles the synthesizer features with the content features. This strategy effectively enhances the feature diversity and simulates more feature combinations.
   Submitted 13 November, 2024; originally announced November 2024.

11. arXiv:2411.08378 [cs.LG, cs.AI]
   Title: Physics Informed Distillation for Diffusion Models
   Authors: Joshua Tian Jin Tee, Kang Zhang, Hee Suk Yoon, Dhananjaya Nagaraja Gowda, Chanwoo Kim, Chang D. Yoo
   Abstract: Diffusion models have recently emerged as a potent tool in generative modeling. However, their inherent iterative nature often results in sluggish image generation due to the requirement for multiple model evaluations.
   Recent progress has unveiled the intrinsic link between diffusion models and Probability Flow Ordinary Differential Equations (ODEs), thus enabling us to conceptualize diffusion models as ODE systems. Simultaneously, Physics Informed Neural Networks (PINNs) have substantiated their effectiveness in solving intricate differential equations through implicit modeling of their solutions. Building upon these foundational insights, we introduce Physics Informed Distillation (PID), which employs a student model to represent the solution of the ODE system corresponding to the teacher diffusion model, akin to the principles employed in PINNs. Through experiments on CIFAR 10 and ImageNet 64x64, we observe that PID achieves performance comparable to recent distillation methods. Notably, it demonstrates predictable trends concerning method-specific hyperparameters and eliminates the need for synthetic dataset generation during the distillation process. Both contribute to its ease of use as a distillation approach for diffusion models. Our code and pre-trained checkpoint are publicly available at: https://github.com/pantheon5100/pid_diffusion.git.
   Submitted 13 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07445">arXiv:2411.07445</a> <span> [<a href="https://arxiv.org/pdf/2411.07445">pdf</a>, <a href="https://arxiv.org/format/2411.07445">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> All-in-one Weather-degraded Image Restoration via Adaptive Degradation-aware Self-prompting Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Yuanbo Wen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+T">Tao Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Ziqi Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaihao Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Ting Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07445v1-abstract-short" style="display: inline;"> Existing approaches for all-in-one weather-degraded image restoration suffer from inefficiencies in leveraging degradation-aware priors, resulting in sub-optimal performance in adapting to different weather conditions. To this end, we develop an adaptive degradation-aware self-prompting model (ADSM) for all-in-one weather-degraded image restoration. Specifically, our model employs the contrastive… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07445v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07445v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07445v1-abstract-full" style="display: none;"> Existing approaches for all-in-one weather-degraded image restoration suffer from inefficiencies in leveraging degradation-aware priors, resulting in sub-optimal performance in adapting to different weather conditions. To this end, we develop an adaptive degradation-aware self-prompting model (ADSM) for all-in-one weather-degraded image restoration. Specifically, our model employs the contrastive language-image pre-training model (CLIP) to facilitate the training of our proposed latent prompt generators (LPGs), which represent three types of latent prompts to characterize the degradation type, degradation property and image caption. Moreover, we integrate the acquired degradation-aware prompts into the time embedding of diffusion model to improve degradation perception. Meanwhile, we employ the latent caption prompt to guide the reverse sampling process using the cross-attention mechanism, thereby guiding the accurate image reconstruction. Furthermore, to accelerate the reverse sampling procedure of diffusion model and address the limitations of frequency perception, we introduce a wavelet-oriented noise estimating network (WNE-Net). Extensive experiments conducted on eight publicly available datasets demonstrate the effectiveness of our proposed approach in both task-specific and all-in-one applications. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07445v1-abstract-full').style.display = 'none'; document.getElementById('2411.07445v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06559">arXiv:2411.06559</a> <span> [<a href="https://arxiv.org/pdf/2411.06559">pdf</a>, <a href="https://arxiv.org/format/2411.06559">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Is Your LLM Secretly a World Model of the Internet? Model-Based Planning for Web Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yu Gu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Boyuan Zheng</a>, <a href="/search/cs?searchtype=author&query=Gou%2C+B">Boyu Gou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+C">Cheng Chang</a>, <a href="/search/cs?searchtype=author&query=Srivastava%2C+S">Sanjari Srivastava</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yanan Xie</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+P">Peng Qi</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Huan Sun</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yu Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06559v1-abstract-short" style="display: inline;"> Language agents have demonstrated promising capabilities in automating web-based tasks, though their current reactive approaches still underperform largely compared to humans. While incorporating advanced planning algorithms, particularly tree search methods, could enhance these agents' performance, implementing tree search directly on live websites poses significant safety risks and practical con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06559v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06559v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06559v1-abstract-full" style="display: none;"> Language agents have demonstrated promising capabilities in automating web-based tasks, though their current reactive approaches still underperform largely compared to humans. While incorporating advanced planning algorithms, particularly tree search methods, could enhance these agents' performance, implementing tree search directly on live websites poses significant safety risks and practical constraints due to irreversible actions such as confirming a purchase. In this paper, we introduce a novel paradigm that augments language agents with model-based planning, pioneering the innovative use of large language models (LLMs) as world models in complex web environments. 
Our method, WebDreamer, builds on the key insight that LLMs inherently encode comprehensive knowledge about website structures and functionalities. Specifically, WebDreamer uses LLMs to simulate outcomes for each candidate action (e.g., "what would happen if I click this button?") using natural language descriptions, and then evaluates these imagined outcomes to determine the optimal action at each step. Empirical results on two representative web agent benchmarks with online interaction -- VisualWebArena and Mind2Web-live -- demonstrate that WebDreamer achieves substantial improvements over reactive baselines. By establishing the viability of LLMs as world models in web environments, this work lays the groundwork for a paradigm shift in automated web interaction. More broadly, our findings open exciting new avenues for future research into 1) optimizing LLMs specifically for world modeling in complex, dynamic environments, and 2) model-based speculative planning for language agents.

Submitted 10 November, 2024; originally announced November 2024.
Comments: 18 pages, 6 figures, 4 tables
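The simulate-then-score loop the WebDreamer abstract describes can be sketched roughly as follows; the llm callable, prompts, and scoring scheme are placeholders rather than the paper's actual interface.

```python
# Illustrative sketch of model-based planning with an LLM as world model,
# as described in the WebDreamer abstract. The `llm` callable and prompts
# are placeholders, not the authors' actual interface.
from typing import Callable, List

def plan_next_action(llm: Callable[[str], str],
                     goal: str,
                     page_state: str,
                     candidate_actions: List[str]) -> str:
    """Simulate each candidate action in natural language, score the imagined
    outcome against the goal, and return the highest-scoring action."""
    best_action, best_score = candidate_actions[0], float("-inf")
    for action in candidate_actions:
        # "World model" step: ask the LLM what would happen after the action.
        outcome = llm(
            f"Current page: {page_state}\n"
            f"If the user performs: {action}\n"
            "Describe the resulting page in one sentence."
        )
        # Evaluation step: ask the LLM to rate how much the outcome helps the goal.
        rating = llm(
            f"Goal: {goal}\nImagined outcome: {outcome}\n"
            "Rate progress toward the goal from 0 to 10. Answer with a number."
        )
        try:
            score = float(rating.strip().split()[0])
        except ValueError:
            score = 0.0
        if score > best_score:
            best_action, best_score = action, score
    return best_action

if __name__ == "__main__":
    # Dummy LLM that favors the search action in this toy example.
    fake_llm = lambda p: (("search results listing laptops" if "Search" in p else "the shopping cart page")
                          if "Describe" in p
                          else ("9" if "search results" in p else "3"))
    print(plan_next_action(fake_llm, "find a laptop",
                           "home page of a shop",
                           ["click 'Search'", "click 'Cart'"]))
```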

arXiv:2411.06518 [pdf, other] cs.LG, q-bio.QM, stat.ME
Causal Representation Learning from Multimodal Biological Observations
Authors: Yuewen Sun, Lingjing Kong, Guangyi Chen, Loka Li, Gongxu Luo, Zijian Li, Yixuan Zhang, Yujia Zheng, Mengyue Yang, Petar Stojanov, Eran Segal, Eric P. Xing, Kun Zhang
Abstract: Prevalent in biological applications (e.g., human phenotype measurements), multimodal datasets can provide valuable insights into the underlying biological mechanisms. However, current machine learning models designed to analyze such datasets still lack interpretability and theoretical guarantees, which are essential to biological applications. Recent advances in causal representation learning have shown promise in uncovering the interpretable latent causal variables with formal theoretical certificates. Unfortunately, existing works for multimodal distributions either rely on restrictive parametric assumptions or provide rather coarse identification results, limiting their applicability to biological research which favors a detailed understanding of the mechanisms. In this work, we aim to develop flexible identification conditions for multimodal data and principled methods to facilitate the understanding of biological datasets. Theoretically, we consider a flexible nonparametric latent distribution (c.f., parametric assumptions in prior work) permitting causal relationships across potentially different modalities. We establish identifiability guarantees for each latent component, extending the subspace identification results from prior work. Our key theoretical ingredient is the structural sparsity of the causal connections among distinct modalities, which, as we will discuss, is natural for a large collection of biological systems. Empirically, we propose a practical framework to instantiate our theoretical insights. We demonstrate the effectiveness of our approach through extensive experiments on both numerical and synthetic datasets. Results on a real-world human phenotype dataset are consistent with established medical research, validating our theoretical and methodological framework.

Submitted 10 November, 2024; originally announced November 2024.

arXiv:2411.06096 [pdf, other] cs.CL
ZhoBLiMP: a Systematic Assessment of Language Models with Linguistic Minimal Pairs in Chinese
Authors: Yikang Liu, Yeting Shen, Hongao Zhu, Lilong Xu, Zhiheng Qian, Siyuan Song, Kejia Zhang, Jialong Tang, Pei Zhang, Baosong Yang, Rui Wang, Hai Hu
Abstract: Whether and how language models (LMs) acquire the syntax of natural languages has been widely evaluated under the minimal pair paradigm. However, a lack of wide-coverage benchmarks in languages other than English has constrained systematic investigations into the issue. Addressing it, we first introduce ZhoBLiMP, the most comprehensive benchmark of linguistic minimal pairs for Chinese to date, with 118 paradigms, covering 15 linguistic phenomena. We then train 20 LMs of different sizes (14M to 1.4B) on Chinese corpora of various volumes (100M to 3B tokens) and evaluate them along with 14 off-the-shelf LLMs on ZhoBLiMP. The overall results indicate that Chinese grammar can be mostly learned by models with around 500M parameters, trained on 1B tokens with one epoch, showing limited benefits for further scaling. Most (N=95) linguistic paradigms are of easy or medium difficulty for LMs, while there are still 13 paradigms that remain challenging even for models with up to 32B parameters.
In regard to how LMs acquire Chinese grammar, we observe a U-shaped learning pattern in several phenomena, similar to those observed in child language acquisition.

Submitted 9 November, 2024; originally announced November 2024.
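For readers unfamiliar with the minimal pair paradigm mentioned above, the standard BLiMP-style protocol scores a language model by whether it assigns higher probability to the grammatical member of each pair. A generic sketch follows; the model choice and scoring details are assumptions, not the ZhoBLiMP evaluation code.

```python
# Generic sketch of minimal-pair evaluation: an LM "passes" a pair if it
# assigns a higher log-probability to the grammatical sentence. Model choice
# and scoring details are illustrative assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def sentence_logprob(model, tokenizer, text: str) -> float:
    ids = tokenizer(text, return_tensors="pt").input_ids
    with torch.no_grad():
        # labels=ids makes the model return the mean cross-entropy over tokens.
        loss = model(ids, labels=ids).loss
    return -loss.item() * ids.size(1)  # approximate total log-probability

def minimal_pair_accuracy(model, tokenizer, pairs) -> float:
    correct = sum(
        sentence_logprob(model, tokenizer, good) > sentence_logprob(model, tokenizer, bad)
        for good, bad in pairs
    )
    return correct / len(pairs)

if __name__ == "__main__":
    name = "gpt2"  # stand-in; a Chinese LM would be used for ZhoBLiMP
    tok = AutoTokenizer.from_pretrained(name)
    lm = AutoModelForCausalLM.from_pretrained(name)
    pairs = [("The cats sleep.", "The cats sleeps.")]
    print(minimal_pair_accuracy(lm, tok, pairs))
```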

arXiv:2411.04788 [pdf, other] cs.AI, cs.CL, cs.LG, q-fin.ST, q-fin.TR
Enhancing Investment Analysis: Optimizing AI-Agent Collaboration in Financial Research
Authors: Xuewen Han, Neng Wang, Shangkun Che, Hongyang Yang, Kunpeng Zhang, Sean Xin Xu
Abstract: In recent years, the application of generative artificial intelligence (GenAI) in financial analysis and investment decision-making has gained significant attention. However, most existing approaches rely on single-agent systems, which fail to fully utilize the collaborative potential of multiple AI agents. In this paper, we propose a novel multi-agent collaboration system designed to enhance decision-making in financial investment research. The system incorporates agent groups with both configurable group sizes and collaboration structures to leverage the strengths of each agent group type. By utilizing a sub-optimal combination strategy, the system dynamically adapts to varying market conditions and investment scenarios, optimizing performance across different tasks. We focus on three sub-tasks: fundamentals, market sentiment, and risk analysis, by analyzing the 2023 SEC 10-K forms of 30 companies listed on the Dow Jones Index. Our findings reveal significant performance variations based on the configurations of AI agents for different tasks. The results demonstrate that our multi-agent collaboration system outperforms traditional single-agent models, offering improved accuracy, efficiency, and adaptability in complex financial environments. This study highlights the potential of multi-agent systems in transforming financial analysis and investment decision-making by integrating diverse analytical perspectives.

Submitted 7 November, 2024; originally announced November 2024.

arXiv:2411.04558 [pdf, other] quant-ph, cs.CR
Experimental Secure Multiparty Computation from Quantum Oblivious Transfer with Bit Commitment
Authors: Kai-Yi Zhang, An-Jing Huang, Kun Tu, Ming-Han Li, Chi Zhang, Wei Qi, Ya-Dong Wu, Yu Yu
Abstract: Secure multiparty computation enables collaborative computations across multiple users while preserving individual privacy, which has a wide range of applications in finance, machine learning and healthcare. Secure multiparty computation can be realized using oblivious transfer as a primitive function.
In this paper, we present an experimental implementation of a quantum-secure quantum oblivious transfer (QOT) protocol using an adapted quantum key distribution system combined with a bit commitment scheme, surpassing previous approaches only secure in the noisy storage model. We demonstrate the first practical application of the QOT protocol by solving the private set intersection, a prime example of secure multiparty computation, where two parties aim to find common elements in their datasets without revealing any other information. In our experiments, two banks can identify common suspicious accounts without disclosing any other data. This not only proves the experimental functionality of QOT, but also showcases its real-world commercial applications.

Submitted 7 November, 2024; originally announced November 2024.

arXiv:2411.03910 [pdf, ps, other] cs.CR, eess.SP
WiP: Towards a Secure SECP256K1 for Crypto Wallets: Hardware Architecture and Implementation
Authors: Joel Poncha Lemayian, Ghyslain Gagnon, Kaiwen Zhang, Pascal Giard
Abstract: The SECP256K1 elliptic curve algorithm is fundamental in cryptocurrency wallets for generating secure public keys from private keys, thereby ensuring the protection and ownership of blockchain-based digital assets.
However, the literature highlights several successful side-channel attacks on hardware wallets that exploit SECP256K1 to extract private keys. This work proposes a novel hardware architecture for SECP256K1, optimized for side-channel attack resistance and efficient resource utilization. The architecture incorporates complete addition formulas, temporary registers, and parallel processing techniques, making elliptic curve point addition and doubling operations indistinguishable. Implementation results demonstrate an average reduction of 45% in LUT usage compared to similar works, emphasizing the design's resource efficiency.

Submitted 6 November, 2024; originally announced November 2024.
Comments: Presented at HASP 2024 @ MICRO 2024, https://haspworkshop.org/2024/program.html

arXiv:2411.03743 [pdf, other] cs.AI, q-bio.QM
Automating Exploratory Proteomics Research via Language Models
Authors: Ning Ding, Shang Qu, Linhai Xie, Yifei Li, Zaoqu Liu, Kaiyan Zhang, Yibai Xiong, Yuxin Zuo, Zhangren Chen, Ermo Hua, Xingtai Lv, Youbang Sun, Yang Li, Dong Li, Fuchu He, Bowen Zhou
Abstract: With the development of artificial intelligence, its contribution to science is evolving from simulating a complex problem to automating entire research processes and producing novel discoveries. Achieving this advancement requires both specialized general models grounded in real-world scientific data and iterative, exploratory frameworks that mirror human scientific methodologies. In this paper, we present PROTEUS, a fully automated system for scientific discovery from raw proteomics data.
PROTEUS uses large language models (LLMs) to perform hierarchical planning, execute specialized bioinformatics tools, and iteratively refine analysis workflows to generate high-quality scientific hypotheses. The system takes proteomics datasets as input and produces a comprehensive set of research objectives, analysis results, and novel biological hypotheses without human intervention. We evaluated PROTEUS on 12 proteomics datasets collected from various biological samples (e.g. immune cells, tumors) and different sample types (single-cell and bulk), generating 191 scientific hypotheses. These were assessed using both automatic LLM-based scoring on 5 metrics and detailed reviews from human experts. Results demonstrate that PROTEUS consistently produces reliable, logically coherent results that align well with existing literature while also proposing novel, evaluable hypotheses. The system's flexible architecture facilitates seamless integration of diverse analysis tools and adaptation to different proteomics data types. By automating complex proteomics analysis workflows and hypothesis generation, PROTEUS has the potential to considerably accelerate the pace of scientific discovery in proteomics research, enabling researchers to efficiently explore large-scale datasets and uncover biological insights.

Submitted 6 November, 2024; originally announced November 2024.

arXiv:2411.03731 [pdf, other] cs.LG, stat.ML
Reducing Hyperparameter Tuning Costs in ML, Vision and Language Model Training Pipelines via Memoization-Awareness
Authors: Abdelmajid Essofi, Ridwan Salahuddeen, Munachiso Nwadike, Elnura Zhalieva, Kun Zhang, Eric Xing, Willie Neiswanger, Qirong Ho
Abstract: The training or fine-tuning of machine learning, vision, and language models is often implemented as a pipeline: a sequence of stages encompassing data preparation, model training and evaluation. In this paper, we exploit pipeline structures to reduce the cost of hyperparameter tuning for model training/fine-tuning, which is particularly valuable for language models given their high costs in GPU-days. We propose a "memoization-aware" Bayesian Optimization (BO) algorithm, EEIPU, that works in tandem with a pipeline caching system, allowing it to evaluate significantly more hyperparameter candidates per GPU-day than other tuning algorithms. The result is better-quality hyperparameters in the same amount of search time, or equivalently, reduced search time to reach the same hyperparameter quality.
In our benchmarks on machine learning (model ensembles), vision (convolutional architecture) and language (T5 architecture) pipelines, we compare EEIPU against recent BO algorithms: EEIPU produces an average of 103% more hyperparameter candidates (within the same budget), and increases the validation metric by an average of 108% more than other algorithms (where the increase is measured starting from the end of warm-up iterations).

Submitted 6 November, 2024; originally announced November 2024.
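The pipeline-caching mechanism that a memoization-aware tuner like EEIPU relies on can be illustrated with a toy example: stage outputs are keyed by the hyperparameters of that stage and everything upstream of it, so candidates sharing a prefix skip recomputation. The stage functions and keys below are invented for illustration and do not reproduce the EEIPU algorithm itself.

```python
# Toy illustration of pipeline memoization: stages whose upstream
# hyperparameters are unchanged are served from a cache instead of being
# recomputed. Stage functions and keys are made up for the example.
CACHE = {}

def run_pipeline(stages, hparams):
    """stages: list of (name, fn) run in order; fn(prev_output, stage_hparams) -> output.
    The cache key for a stage covers its own hyperparameters and those of all
    stages before it, so shared prefixes are reused across candidates."""
    output, prefix_key = None, ()
    for name, fn in stages:
        prefix_key = prefix_key + ((name, tuple(sorted(hparams.get(name, {}).items()))),)
        hit = prefix_key in CACHE
        if hit:
            output = CACHE[prefix_key]          # memoized: skip recomputation
        else:
            output = fn(output, hparams.get(name, {}))
            CACHE[prefix_key] = output
        print(f"{name}: {'cache hit' if hit else 'computed'}")
    return output

if __name__ == "__main__":
    stages = [
        ("prepare", lambda prev, h: list(range(h.get("n", 4)))),
        ("train",   lambda prev, h: sum(x * h.get("lr", 0.1) for x in prev)),
        ("evaluate", lambda prev, h: -abs(prev - 1.0)),
    ]
    # Two candidates sharing the same "prepare" settings: the second reuses it.
    run_pipeline(stages, {"prepare": {"n": 8}, "train": {"lr": 0.1}})
    run_pipeline(stages, {"prepare": {"n": 8}, "train": {"lr": 0.3}})
```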

arXiv:2411.03723 [pdf] eess.IV, cs.CV
Zero-shot Dynamic MRI Reconstruction with Global-to-local Diffusion Model
Authors: Yu Guan, Kunlong Zhang, Qi Qi, Dong Wang, Ziwen Ke, Shaoyu Wang, Dong Liang, Qiegen Liu
Abstract: Diffusion models have recently demonstrated considerable advancement in the generation and reconstruction of magnetic resonance imaging (MRI) data. These models exhibit great potential in handling unsampled data and reducing noise, highlighting their promise as generative models. However, their application in dynamic MRI remains relatively underexplored. This is primarily due to the substantial amount of fully-sampled data typically required for training, which is difficult to obtain in dynamic MRI due to its spatio-temporal complexity and high acquisition costs. To address this challenge, we propose a dynamic MRI reconstruction method based on a time-interleaved acquisition scheme, termed the Global-to-local Diffusion Model. Specifically, fully encoded full-resolution reference data are constructed by merging under-sampled k-space data from adjacent time frames, generating two distinct bulk training datasets for global and local models. The global-to-local diffusion framework alternately optimizes global information and local image details, enabling zero-shot reconstruction. Extensive experiments demonstrate that the proposed method performs well in terms of noise reduction and detail preservation, achieving reconstruction quality comparable to that of supervised approaches.

Submitted 6 November, 2024; originally announced November 2024.
Comments: 11 pages, 9 figures

arXiv:2411.03129 [pdf, other] physics.bio-ph, cs.CV
MA^2: A Self-Supervised and Motion Augmenting Autoencoder for Gait-Based Automatic Disease Detection
Authors: Yiqun Liu, Ke Zhang, Yin Zhu
Abstract: Ground reaction force (GRF) is the force exerted by the ground on a body in contact with it. GRF-based automatic disease detection (ADD) has become an emerging medical diagnosis method, which aims to learn and identify disease patterns corresponding to different gait pressures based on deep learning methods.
Although existing ADD methods can save doctors time in making diagnoses, training deep models still struggles with the cost of the labeling engineering required for large amounts of gait diagnostic data from subjects. On the other hand, the accuracy of the deep model under the unified benchmark GRF dataset and the generalization ability on scalable gait datasets need to be further improved. To address these issues, we propose MA2, a GRF-based self-supervised and motion augmenting auto-encoder, which models the ADD task as an encoder-decoder paradigm. In the encoder, we introduce an embedding block including a 3-layer 1D convolution for extracting the token and a mask generator to randomly mask out the sequence of tokens to maximize the model's potential to capture high-level, discriminative, intrinsic representations. Thereafter, the decoder utilizes this information to reconstruct the pixel sequence of the original input and calculate the reconstruction loss to optimize the network. Moreover, the backbone of the auto-encoder is multi-head self-attention, which can consider the global information of the token from the input, not just the local neighborhood. This allows the model to capture generalized contextual information. Extensive experiments demonstrate MA2 has SOTA performance of 90.91% accuracy on 1% limited pathological GRF samples with labels, and good generalization ability of 78.57% accuracy on the scalable Parkinson disease dataset.

Submitted 5 November, 2024; originally announced November 2024.
Comments: 8 pages, 11 figures, article

arXiv:2411.02592 [pdf, other] cs.CV, cs.AI
Decoupled Data Augmentation for Improving Image Classification
Authors: Ruoxin Chen, Zhe Wang, Ke-Yue Zhang, Shuang Wu, Jiamu Sun, Shouli Wang, Taiping Yao, Shouhong Ding
Abstract: Recent advancements in image mixing and generative data augmentation have shown promise in enhancing image classification. However, these techniques face the challenge of balancing semantic fidelity with diversity. Specifically, image mixing involves interpolating two images to create a new one, but this pixel-level interpolation can compromise fidelity. Generative augmentation uses text-to-image generative models to synthesize or modify images, often limiting diversity to avoid generating out-of-distribution data that potentially affects accuracy. We propose that this fidelity-diversity dilemma partially stems from the whole-image paradigm of existing methods. Since an image comprises the class-dependent part (CDP) and the class-independent part (CIP), where each part has fundamentally different impacts on the image's fidelity, treating different parts uniformly can therefore be misleading.
To address this fidelity-diversity dilemma, we introduce Decoupled Data Augmentation (De-DA), which resolves the dilemma by separating images into CDPs and CIPs and handling them adaptively. To maintain fidelity, we use generative models to modify real CDPs under controlled conditions, preserving semantic consistency. To enhance diversity, we replace the image's CIP with inter-class variants, creating diverse CDP-CIP combinations. Additionally, we implement an online randomized combination strategy during training to generate numerous distinct CDP-CIP combinations cost-effectively. Comprehensive empirical evaluations validate the effectiveness of our method.

Submitted 29 October, 2024; originally announced November 2024.
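The online CDP-CIP recombination described in the De-DA abstract amounts to compositing a class-dependent foreground (under a mask) onto a class-independent background drawn from another image. Below is a toy NumPy sketch under assumed array shapes and a made-up random pairing, not the paper's code.

```python
# Toy sketch of online CDP-CIP recombination: paste a class-dependent part
# (foreground under a mask) onto a class-independent part (background) taken
# from another image. Shapes and pairing are illustrative assumptions.
import numpy as np

def recombine(cdp_image, cdp_mask, cip_image):
    """Paste the masked foreground of `cdp_image` onto `cip_image`.
    cdp_mask is a {0,1} array broadcastable to the image shape."""
    mask = cdp_mask[..., None] if cdp_mask.ndim == 2 else cdp_mask
    return mask * cdp_image + (1 - mask) * cip_image

def online_batch(images, masks, rng=np.random.default_rng(0)):
    """For each sample, draw a random other image to supply the background."""
    idx = rng.permutation(len(images))
    return np.stack([recombine(images[i], masks[i], images[j])
                     for i, j in enumerate(idx)])

if __name__ == "__main__":
    imgs = np.random.rand(4, 32, 32, 3)
    msks = (np.random.rand(4, 32, 32) > 0.5).astype(np.float32)
    print(online_batch(imgs, msks).shape)  # (4, 32, 32, 3)
```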

arXiv:2411.02265 [pdf, other] cs.CL, cs.AI
Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent
Authors: Xingwu Sun, Yanfeng Chen, Yiqing Huang, Ruobing Xie, Jiaqi Zhu, Kai Zhang, Shuaipeng Li, Zhen Yang, Jonny Han, Xiaobo Shu, Jiahao Bu, Zhongzhi Chen, Xuemeng Huang, Fengzong Lian, Saiyong Yang, Jianfeng Yan, Yuyuan Zeng, Xiaoqin Ren, Chao Yu, Lulu Wu, Yue Mao, Jun Xia, Tao Yang, Suncong Zheng, Kan Wu, et al. (83 additional authors not shown)
Abstract: In this paper, we introduce Hunyuan-Large, which is currently the largest open-source Transformer-based mixture of experts model, with a total of 389 billion parameters and 52 billion activation parameters, capable of handling up to 256K tokens. We conduct a thorough evaluation of Hunyuan-Large's superior performance across various benchmarks including language understanding and generation, logical reasoning, mathematical problem-solving, coding, long-context, and aggregated tasks, where it outperforms LLama3.1-70B and exhibits comparable performance to the significantly larger LLama3.1-405B model. Key practices of Hunyuan-Large include large-scale synthetic data that is orders of magnitude larger than in previous literature, a mixed expert routing strategy, a key-value cache compression technique, and an expert-specific learning rate strategy. Additionally, we investigate the scaling laws and learning rate schedule of mixture of experts models, providing valuable insights and guidance for future model development and optimization. The code and checkpoints of Hunyuan-Large are released to facilitate future innovations and applications. Codes: https://github.com/Tencent/Hunyuan-Large Models: https://huggingface.co/tencent/Tencent-Hunyuan-Large

Submitted 6 November, 2024; v1 submitted 4 November, 2024; originally announced November 2024.
Comments: 17 pages, 4 figures

arXiv:2411.02063 [pdf, other] cs.CL, cs.AI, cs.LG
Scalable Efficient Training of Large Language Models with Low-dimensional Projected Attention
Authors: Xingtai Lv, Ning Ding, Kaiyan Zhang, Ermo Hua, Ganqu Cui, Bowen Zhou
Abstract: Improving the effectiveness and efficiency of large language models (LLMs) simultaneously is a critical yet challenging research goal. In this paper, we find that low-rank pre-training, normally considered an efficient method that compromises performance, can be scalably effective when the reduced parameters are precisely targeted. Specifically, applying the low-dimensional module only to the attention layer resolves this issue and enhances both effectiveness and efficiency. We refer to this structure as Low-dimensional Projected Attention (LPA) and provide an explanatory analysis. Through extensive experimentation at parameter scales of 130M, 370M, and scaling up to 3B, we have validated the effectiveness and scalability of LPA. Our results show that the LPA model can save up to 12.4% in time while achieving an approximate 5% improvement in test perplexity (ppl) and on downstream tasks compared with the vanilla Transformer.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02063v1-abstract-full').style.display = 'none'; document.getElementById('2411.02063v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2024 (Main Conference)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23862">arXiv:2410.23862</a> <span> [<a href="https://arxiv.org/pdf/2410.23862">pdf</a>, <a href="https://arxiv.org/format/2410.23862">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> $蠄$DAG: Projected Stochastic Approximation Iteration for DAG Structure Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ziu%2C+K">Klea Ziu</a>, <a href="/search/cs?searchtype=author&query=Hanzely%2C+S">Slavom铆r Hanzely</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Loka Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kun Zhang</a>, <a href="/search/cs?searchtype=author&query=Tak%C3%A1%C4%8D%2C+M">Martin Tak谩膷</a>, <a href="/search/cs?searchtype=author&query=Kamzolov%2C+D">Dmitry Kamzolov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23862v1-abstract-short" style="display: inline;"> Learning the structure of Directed Acyclic Graphs (DAGs) presents a significant challenge due to the vast combinatorial search space of possible graphs, which scales exponentially with the number of nodes. Recent advancements have redefined this problem as a continuous optimization task by incorporating differentiable acyclicity constraints. These methods commonly rely on algebraic characterizatio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23862v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23862v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23862v1-abstract-full" style="display: none;"> Learning the structure of Directed Acyclic Graphs (DAGs) presents a significant challenge due to the vast combinatorial search space of possible graphs, which scales exponentially with the number of nodes. Recent advancements have redefined this problem as a continuous optimization task by incorporating differentiable acyclicity constraints. These methods commonly rely on algebraic characterizations of DAGs, such as matrix exponentials, to enable the use of gradient-based optimization techniques. Despite these innovations, existing methods often face optimization difficulties due to the highly non-convex nature of DAG constraints and the per-iteration computational complexity. 
In this work, we present a novel framework for learning DAGs, employing a Stochastic Approximation approach integrated with Stochastic Gradient Descent (SGD)-based optimization techniques. Our framework introduces new projection methods tailored to efficiently enforce DAG constraints, ensuring that the algorithm converges to a feasible local minimum. With its low iteration complexity, the proposed method is well-suited for handling large-scale problems with improved computational efficiency. We demonstrate the effectiveness and scalability of our framework through comprehensive experimental evaluations, which confirm its superior performance across various settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23862v1-abstract-full').style.display = 'none'; document.getElementById('2410.23862v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22844">arXiv:2410.22844</a> <span> [<a href="https://arxiv.org/pdf/2410.22844">pdf</a>, <a href="https://arxiv.org/format/2410.22844">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Understanding and Improving Adversarial Collaborative Filtering for Robust Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaike Zhang</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Q">Qi Cao</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yunfan Wu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+F">Fei Sun</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Huawei Shen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xueqi Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22844v2-abstract-short" style="display: inline;"> Adversarial Collaborative Filtering (ACF), which typically applies adversarial perturbations at user and item embeddings through adversarial training, is widely recognized as an effective strategy for enhancing the robustness of Collaborative Filtering (CF) recommender systems against poisoning attacks. Besides, numerous studies have empirically shown that ACF can also improve recommendation perfo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22844v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22844v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22844v2-abstract-full" style="display: none;"> Adversarial Collaborative Filtering (ACF), which typically applies adversarial perturbations at user and item embeddings through adversarial training, is widely recognized as an effective strategy for enhancing the robustness of Collaborative Filtering (CF) recommender systems against poisoning attacks. 
Besides, numerous studies have empirically shown that ACF can also improve recommendation performance compared to traditional CF. Despite these empirical successes, the theoretical understanding of ACF's effectiveness in terms of both performance and robustness remains unclear. To bridge this gap, in this paper, we first theoretically show that ACF can achieve a lower recommendation error compared to traditional CF with the same training epochs in both clean and poisoned data contexts. Furthermore, by establishing bounds for reductions in recommendation error during ACF's optimization process, we find that applying personalized magnitudes of perturbation for different users based on their embedding scales can further improve ACF's effectiveness. Building on these theoretical understandings, we propose Personalized Magnitude Adversarial Collaborative Filtering (PamaCF). Extensive experiments demonstrate that PamaCF effectively defends against various types of poisoning attacks while significantly enhancing recommendation performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22844v2-abstract-full').style.display = 'none'; document.getElementById('2410.22844v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">To appear in NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22480">arXiv:2410.22480</a> <span> [<a href="https://arxiv.org/pdf/2410.22480">pdf</a>, <a href="https://arxiv.org/format/2410.22480">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Scaling LLM Inference with Optimized Sample Compute Allocation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kexun Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Shang Zhou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Danqing Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W+Y">William Yang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lei Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22480v1-abstract-short" style="display: inline;"> Sampling is a basic operation in many inference-time algorithms of large language models (LLMs). To scale up inference efficiently with a limited compute, it is crucial to find an optimal allocation for sample compute budgets: Which sampling configurations (model, temperature, language, etc.) do we use? How many samples do we generate in each configuration? 
We formulate these choices as a learning… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22480v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22480v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22480v1-abstract-full" style="display: none;"> Sampling is a basic operation in many inference-time algorithms of large language models (LLMs). To scale up inference efficiently with a limited compute, it is crucial to find an optimal allocation for sample compute budgets: Which sampling configurations (model, temperature, language, etc.) do we use? How many samples do we generate in each configuration? We formulate these choices as a learning problem and propose OSCA, an algorithm that Optimizes Sample Compute Allocation by finding an optimal mix of different inference configurations. Our experiments show that with our learned mixed allocation, we can achieve accuracy better than the best single configuration with 128x less compute on code generation and 25x less compute on 4 reasoning tasks. OSCA is also shown to be effective in agentic workflows beyond single-turn tasks, achieving a better accuracy on SWE-Bench with 3x less compute than the default configuration. Our code and generations are released at https://github.com/LeiLiLab/OSCA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22480v1-abstract-full').style.display = 'none'; document.getElementById('2410.22480v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
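<p class="is-size-7">The sample-allocation problem sketched in the OSCA abstract above can be pictured with a small, hypothetical example: given per-question success-rate estimates for each sampling configuration (estimated on held-out data), a greedy loop assigns a fixed sample budget so that the expected fraction of questions solved by at least one sample is as large as possible. The function name, the independence assumption, and the greedy rule are illustrative assumptions for exposition, not the OSCA algorithm itself.</p> <pre><code>
def allocate_budget(success, budget):
    """Greedily assign a fixed number of samples to configurations so that the
    expected fraction of questions solved by at least one sample is maximized
    (assumes samples succeed independently with the estimated rates)."""
    n_configs = len(success[0])
    counts = [0] * n_configs
    fail = [1.0] * len(success)   # P(no allocated sample solves question q)
    for _ in range(budget):
        # marginal gain of one extra sample from each configuration
        gains = [sum(f * row[c] for f, row in zip(fail, success))
                 for c in range(n_configs)]
        best = max(range(n_configs), key=gains.__getitem__)
        counts[best] += 1
        fail = [f * (1.0 - row[best]) for f, row in zip(fail, success)]
    expected_solved = sum(1.0 - f for f in fail) / len(success)
    return counts, expected_solved

if __name__ == "__main__":
    # Two sampling configurations, three questions: configuration 0 is strong
    # on the first two questions, configuration 1 is the only one that ever
    # solves the third, so the best allocation is a mix of both.
    success = [[0.8, 0.1],
               [0.7, 0.1],
               [0.0, 0.3]]
    print(allocate_budget(success, budget=8))
</code></pre>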
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22394">arXiv:2410.22394</a> <span> [<a href="https://arxiv.org/pdf/2410.22394">pdf</a>, <a href="https://arxiv.org/format/2410.22394">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> AAAR-1.0: Assessing AI's Potential to Assist Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lou%2C+R">Renze Lou</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hanzi Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sijia Wang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Jiangshu Du</a>, <a href="/search/cs?searchtype=author&query=Kamoi%2C+R">Ryo Kamoi</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+X">Xiaoxin Lu</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jian Xie</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yuxuan Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yusen Zhang</a>, <a href="/search/cs?searchtype=author&query=Ahn%2C+J+J">Jihyun Janice Ahn</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+H">Hongchao Fang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+Z">Zhuoyang Zou</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+W">Wenchao Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xi Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+C">Congying Xia</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lifu Huang</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+W">Wenpeng Yin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22394v1-abstract-short" style="display: inline;"> Numerous studies have assessed the proficiency of AI systems, particularly large language models (LLMs), in facilitating everyday tasks such as email writing, question answering, and creative content generation. However, researchers face unique challenges and opportunities in leveraging LLMs for their own work, such as brainstorming research ideas, designing experiments, and writing or reviewing p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22394v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22394v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22394v1-abstract-full" style="display: none;"> Numerous studies have assessed the proficiency of AI systems, particularly large language models (LLMs), in facilitating everyday tasks such as email writing, question answering, and creative content generation. However, researchers face unique challenges and opportunities in leveraging LLMs for their own work, such as brainstorming research ideas, designing experiments, and writing or reviewing papers. 
In this study, we introduce AAAR-1.0, a benchmark dataset designed to evaluate LLM performance in four fundamental, expertise-intensive research tasks: (i) EquationInference, assessing the correctness of equations based on the contextual information in paper submissions; (ii) ExperimentDesign, designing experiments to validate research ideas and solutions; (iii) PaperWeakness, identifying weaknesses in paper submissions; and (iv) REVIEWCRITIQUE, identifying whether each segment in human reviews is deficient or not. AAAR-1.0 differs from prior benchmarks in two key ways: first, it is explicitly research-oriented, with tasks requiring deep domain expertise; second, it is researcher-oriented, mirroring the primary activities that researchers engage in on a daily basis. An evaluation of both open-source and proprietary LLMs reveals their potential as well as limitations in conducting sophisticated research tasks. We will keep iterating AAAR-1.0 into new versions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22394v1-abstract-full').style.display = 'none'; document.getElementById('2410.22394v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Webpage: https://renzelou.github.io/AAAR-1.0/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21616">arXiv:2410.21616</a> <span> [<a href="https://arxiv.org/pdf/2410.21616">pdf</a>, <a href="https://arxiv.org/format/2410.21616">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Identifying Selections for Unsupervised Subtask Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+Y">Yiwen Qiu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yujia Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21616v1-abstract-short" style="display: inline;"> When solving long-horizon tasks, it is intriguing to decompose the high-level task into subtasks. Decomposing experiences into reusable subtasks can improve data efficiency, accelerate policy generalization, and in general provide promising solutions to multi-task reinforcement learning and imitation learning problems.
However, the concept of subtasks is not sufficiently understood and modeled yet… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21616v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21616v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21616v1-abstract-full" style="display: none;"> When solving long-horizon tasks, it is intriguing to decompose the high-level task into subtasks. Decomposing experiences into reusable subtasks can improve data efficiency, accelerate policy generalization, and in general provide promising solutions to multi-task reinforcement learning and imitation learning problems. However, the concept of subtasks is not sufficiently understood and modeled yet, and existing works often overlook the true structure of the data generation process: subtasks are the results of a $\textit{selection}$ mechanism on actions, rather than possible underlying confounders or intermediates. Specifically, we provide a theory to identify, and experiments to verify the existence of selection variables in such data. These selections serve as subgoals that indicate subtasks and guide policy. In light of this idea, we develop a sequential non-negative matrix factorization (seq- NMF) method to learn these subgoals and extract meaningful behavior patterns as subtasks. Our empirical results on a challenging Kitchen environment demonstrate that the learned subtasks effectively enhance the generalization to new tasks in multi-task imitation learning scenarios. The codes are provided at https://anonymous.4open.science/r/Identifying\_Selections\_for\_Unsupervised\_Subtask\_Discovery/README.md. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21616v1-abstract-full').style.display = 'none'; document.getElementById('2410.21616v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21282">arXiv:2410.21282</a> <span> [<a href="https://arxiv.org/pdf/2410.21282">pdf</a>, <a href="https://arxiv.org/format/2410.21282">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Logic Error Localization in Student Programming Assignments Using Pseudocode and Graph Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhenyu Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kun Zhang</a>, <a href="/search/cs?searchtype=author&query=Sheng%2C+V+S">Victor S. 
Sheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21282v1-abstract-short" style="display: inline;"> Pseudocode is extensively used in introductory programming courses to instruct computer science students in algorithm design, utilizing natural language to define algorithmic behaviors. This learning approach enables students to convert pseudocode into source code and execute it to verify their algorithms' correctness. This process typically introduces two types of errors: syntax errors and logic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21282v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21282v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21282v1-abstract-full" style="display: none;"> Pseudocode is extensively used in introductory programming courses to instruct computer science students in algorithm design, utilizing natural language to define algorithmic behaviors. This learning approach enables students to convert pseudocode into source code and execute it to verify their algorithms' correctness. This process typically introduces two types of errors: syntax errors and logic errors. Syntax errors are often accompanied by compiler feedback, which helps students identify incorrect lines. In contrast, logic errors are more challenging because they do not trigger compiler errors and lack immediate diagnostic feedback, making them harder to detect and correct. To address this challenge, we developed a system designed to localize logic errors within student programming assignments at the line level. Our approach utilizes pseudocode as a scaffold to build a code-pseudocode graph, connecting symbols from the source code to their pseudocode counterparts. We then employ a graph neural network to both localize and suggest corrections for logic errors. Additionally, we have devised a method to efficiently gather logic-error-prone programs during the syntax error correction process and compile these into a dataset that includes single and multiple line logic errors, complete with indices of the erroneous lines. Our experimental results are promising, demonstrating a localization accuracy of 99.2% for logic errors within the top-10 suspected lines, highlighting the effectiveness of our approach in enhancing students' coding proficiency and error correction skills. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21282v1-abstract-full').style.display = 'none'; document.getElementById('2410.21282v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
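<p class="is-size-7">As a rough illustration of the code-pseudocode graph described in the abstract above, the toy sketch below builds a graph whose nodes are source-code lines and pseudocode lines, links each code line to its aligned pseudocode line and to its sequential neighbours, and runs one round of untrained mean-aggregation message passing; a trained graph neural network would instead learn the propagation and a per-line suspiciousness score. The alignment format and indicator features are invented for the example, not the paper's design.</p> <pre><code>
def build_code_pseudocode_graph(n_code, n_pseudo, alignment):
    """Nodes 0..n_code-1 are code lines; n_code..n_code+n_pseudo-1 are
    pseudocode lines. Edges: sequential neighbours plus aligned pairs."""
    edges = set()
    for i in range(n_code - 1):
        edges.add((i, i + 1))
    for j in range(n_pseudo - 1):
        edges.add((n_code + j, n_code + j + 1))
    for code_line, pseudo_line in alignment:      # code line aligned to pseudo line
        edges.add((code_line, n_code + pseudo_line))
    return edges | {(b, a) for a, b in edges}     # make the graph undirected

def mean_message_passing(features, edges):
    """One untrained propagation step: average neighbour features into each node."""
    neighbours = {v: [] for v in range(len(features))}
    for a, b in edges:
        neighbours[a].append(b)
    updated = []
    for v, feat in enumerate(features):
        msgs = [features[u] for u in neighbours[v]] or [feat]
        mean = [sum(vals) / len(msgs) for vals in zip(*msgs)]
        updated.append([0.5 * f + 0.5 * m for f, m in zip(feat, mean)])
    return updated

if __name__ == "__main__":
    # 4 code lines, 3 pseudocode lines; code line 2 has no pseudocode counterpart.
    edges = build_code_pseudocode_graph(4, 3, alignment=[(0, 0), (1, 1), (3, 2)])
    feats = [[1.0, 0.0]] * 4 + [[0.0, 1.0]] * 3   # crude code/pseudocode indicator
    print(mean_message_passing(feats, edges)[2])  # updated feature of the unaligned line
</code></pre>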
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20766">arXiv:2410.20766</a> <span> [<a href="https://arxiv.org/pdf/2410.20766">pdf</a>, <a href="https://arxiv.org/format/2410.20766">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3522763">10.1145/3522763 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Static and Dynamic Attention Framework for Multi Turn Dialogue Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei-Nan Zhang</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Y">Yiming Cui</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaiyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yifa Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qingfu Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lingzhi Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Ting Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20766v1-abstract-short" style="display: inline;"> Recently, research on open domain dialogue systems have attracted extensive interests of academic and industrial researchers. The goal of an open domain dialogue system is to imitate humans in conversations. Previous works on single turn conversation generation have greatly promoted the research of open domain dialogue systems. However, understanding multiple single turn conversations is not equal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20766v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20766v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20766v1-abstract-full" style="display: none;"> Recently, research on open domain dialogue systems have attracted extensive interests of academic and industrial researchers. The goal of an open domain dialogue system is to imitate humans in conversations. Previous works on single turn conversation generation have greatly promoted the research of open domain dialogue systems. However, understanding multiple single turn conversations is not equal to the understanding of multi turn dialogue due to the coherent and context dependent properties of human dialogue. Therefore, in open domain multi turn dialogue generation, it is essential to modeling the contextual semantics of the dialogue history, rather than only according to the last utterance. Previous research had verified the effectiveness of the hierarchical recurrent encoder-decoder framework on open domain multi turn dialogue generation. However, using RNN-based model to hierarchically encoding the utterances to obtain the representation of dialogue history still face the problem of a vanishing gradient. 
To address this issue, in this paper, we propose a static and dynamic attention-based approach to model the dialogue history and then generate open domain multi turn dialogue responses. Experimental results on the Ubuntu and OpenSubtitles datasets verify the effectiveness of the proposed static and dynamic attention-based approach on automatic and human evaluation metrics in various experimental settings. Meanwhile, we also empirically verify the performance of combining the static and dynamic attentions on open domain multi turn dialogue generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20766v1-abstract-full').style.display = 'none'; document.getElementById('2410.20766v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">published as a journal paper at ACM Transactions on Information Systems 2023. 30 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ACM Trans. Inf. Syst. 41, 1, Article 15 (January 2023) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20285">arXiv:2410.20285</a> <span> [<a href="https://arxiv.org/pdf/2410.20285">pdf</a>, <a href="https://arxiv.org/format/2410.20285">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and Iterative Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Antoniades%2C+A">Antonis Antoniades</a>, <a href="/search/cs?searchtype=author&query=%C3%96rwall%2C+A">Albert Örwall</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kexun Zhang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yuxi Xie</a>, <a href="/search/cs?searchtype=author&query=Goyal%2C+A">Anirudh Goyal</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">William Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20285v2-abstract-short" style="display: inline;"> Software engineers operating in complex and dynamic environments must continuously adapt to evolving requirements, learn iteratively from experience, and reconsider their approaches based on new insights.
However, current large language model (LLM)-based software agents often rely on rigid processes and tend to repeat ineffective actions without the capacity to evaluate their performance or adapt… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20285v2-abstract-full').style.display = 'inline'; document.getElementById('2410.20285v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20285v2-abstract-full" style="display: none;"> Software engineers operating in complex and dynamic environments must continuously adapt to evolving requirements, learn iteratively from experience, and reconsider their approaches based on new insights. However, current large language model (LLM)-based software agents often rely on rigid processes and tend to repeat ineffective actions without the capacity to evaluate their performance or adapt their strategies over time. To address these challenges, we propose SWE-Search, a multi-agent framework that integrates Monte Carlo Tree Search (MCTS) with a self-improvement mechanism to enhance software agents' performance on repository-level software tasks. SWE-Search extends traditional MCTS by incorporating a hybrid value function that leverages LLMs for both numerical value estimation and qualitative evaluation. This enables self-feedback loops where agents iteratively refine their strategies based on both quantitative numerical evaluations and qualitative natural language assessments of pursued trajectories. The framework includes a SWE-Agent for adaptive exploration, a Value Agent for iterative feedback, and a Discriminator Agent that facilitates multi-agent debate for collaborative decision-making. Applied to the SWE-bench benchmark, our approach demonstrates a 23% relative improvement in performance across five models compared to standard open-source agents without MCTS. Our analysis reveals how performance scales with increased search depth and identifies key factors that facilitate effective self-evaluation in software agents. This work highlights the potential of self-evaluation driven search techniques to enhance agent reasoning and planning in complex, dynamic software engineering environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20285v2-abstract-full').style.display = 'none'; document.getElementById('2410.20285v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Main body: 10 pages, 5 figures. Appendix: 5 pages, 4 figures. 
Open-source codebase</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20174">arXiv:2410.20174</a> <span> [<a href="https://arxiv.org/pdf/2410.20174">pdf</a>, <a href="https://arxiv.org/format/2410.20174">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3563389">10.1145/3563389 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Stack-Propagation Framework for Low-Resource Personalized Dialogue Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+H">Haoyu Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei-Nan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaiyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Ting Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20174v1-abstract-short" style="display: inline;"> With the resurgent interest in building open-domain dialogue systems, the dialogue generation task has attracted increasing attention over the past few years. This task is usually formulated as a conditional generation problem, which aims to generate a natural and meaningful response given dialogue contexts and specific constraints, such as persona. And maintaining a consistent persona is essentia… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20174v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20174v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20174v1-abstract-full" style="display: none;"> With the resurgent interest in building open-domain dialogue systems, the dialogue generation task has attracted increasing attention over the past few years. This task is usually formulated as a conditional generation problem, which aims to generate a natural and meaningful response given dialogue contexts and specific constraints, such as persona. And maintaining a consistent persona is essential for the dialogue systems to gain trust from the users. Although tremendous advancements have been brought, traditional persona-based dialogue models are typically trained by leveraging a large number of persona-dense dialogue examples. Yet, such persona-dense training data are expensive to obtain, leading to a limited scale. This work presents a novel approach to learning from limited training examples by regarding consistency understanding as a regularization of response generation. 
To this end, we propose a novel stack-propagation framework for learning a generation and understanding pipeline.Specifically, the framework stacks a Transformer encoder and two Transformer decoders, where the first decoder models response generation and the second serves as a regularizer and jointly models response generation and consistency understanding. The proposed framework can benefit from the stacked encoder and decoders to learn from much smaller personalized dialogue data while maintaining competitive performance. Under different low-resource settings, subjective and objective evaluations prove that the stack-propagation framework outperforms strong baselines in response quality and persona consistency and largely overcomes the shortcomings of traditional models that rely heavily on the persona-dense dialogue data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20174v1-abstract-full').style.display = 'none'; document.getElementById('2410.20174v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">published as a journal paper at ACM Transactions on Information Systems 2023. 35 pages, 5 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> ACM Trans. Inf. Syst. 41, 3, Article 68 (July 2023) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19276">arXiv:2410.19276</a> <span> [<a href="https://arxiv.org/pdf/2410.19276">pdf</a>, <a href="https://arxiv.org/format/2410.19276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Learning ID-free Item Representation with Token Crossing for Multimodal Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kangning Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+J">Jiarui Jin</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yingjie Qin</a>, <a href="/search/cs?searchtype=author&query=Su%2C+R">Ruilong Su</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jianghao Lin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yong Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weinan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19276v1-abstract-short" style="display: inline;"> Current multimodal recommendation models have extensively explored the effective utilization of multimodal information; however, their reliance on ID embeddings remains a performance bottleneck. Even with the assistance of multimodal information, optimizing ID embeddings remains challenging for ID-based Multimodal Recommender when interaction data is sparse. 
Furthermore, the unique nature of item-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19276v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19276v1-abstract-full" style="display: none;"> Current multimodal recommendation models have extensively explored the effective utilization of multimodal information; however, their reliance on ID embeddings remains a performance bottleneck. Even with the assistance of multimodal information, optimizing ID embeddings remains challenging for ID-based Multimodal Recommender when interaction data is sparse. Furthermore, the unique nature of item-specific ID embeddings hinders the information exchange among related items and the spatial requirement of ID embeddings increases with the scale of item. Based on these limitations, we propose an ID-free MultimOdal TOken Representation scheme named MOTOR that represents each item using learnable multimodal tokens and connects them through shared tokens. Specifically, we first employ product quantization to discretize each item's multimodal features (e.g., images, text) into discrete token IDs. We then interpret the token embeddings corresponding to these token IDs as implicit item features, introducing a new Token Cross Network to capture the implicit interaction patterns among these tokens. The resulting representations can replace the original ID embeddings and transform the original ID-based multimodal recommender into ID-free system, without introducing any additional loss design. MOTOR reduces the overall space requirements of these models, facilitating information interaction among related items, while also significantly enhancing the model's recommendation capability. Extensive experiments on nine mainstream models demonstrate the significant performance improvement achieved by MOTOR, highlighting its effectiveness in enhancing multimodal recommendation systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19276v1-abstract-full').style.display = 'none'; document.getElementById('2410.19276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
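<p class="is-size-7">The tokenization step described in the MOTOR abstract above can be sketched as follows: a (here random) multimodal feature vector is split into sub-vectors, each sub-vector is snapped to the nearest codeword in its sub-space codebook, and the resulting discrete token IDs index shared token embeddings that stand in for a per-item ID embedding. The codebook construction, sizes, and the final averaging are illustrative assumptions rather than the paper's exact design.</p> <pre><code>
import random

def product_quantize(feature, codebooks):
    """Return one token ID per sub-space (nearest codeword by squared L2 distance)."""
    d_sub = len(feature) // len(codebooks)
    token_ids = []
    for s, book in enumerate(codebooks):
        sub = feature[s * d_sub:(s + 1) * d_sub]
        dists = [sum((x - y) ** 2 for x, y in zip(sub, word)) for word in book]
        token_ids.append(min(range(len(book)), key=dists.__getitem__))
    return token_ids

def item_representation(token_ids, token_embeddings):
    """ID-free item vector: average of the shared token embeddings it indexes."""
    vecs = [token_embeddings[s][t] for s, t in enumerate(token_ids)]
    return [sum(vals) / len(vecs) for vals in zip(*vecs)]

if __name__ == "__main__":
    random.seed(0)
    dim, n_sub, n_codes, emb_dim = 8, 2, 4, 4
    feature = [random.random() for _ in range(dim)]                 # stand-in multimodal feature
    codebooks = [[[random.random() for _ in range(dim // n_sub)]
                  for _ in range(n_codes)] for _ in range(n_sub)]   # per-sub-space codewords
    token_embeddings = [[[random.random() for _ in range(emb_dim)]
                         for _ in range(n_codes)] for _ in range(n_sub)]
    ids = product_quantize(feature, codebooks)
    print(ids, item_representation(ids, token_embeddings))
</code></pre>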
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages,6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19265">arXiv:2410.19265</a> <span> [<a href="https://arxiv.org/pdf/2410.19265">pdf</a>, <a href="https://arxiv.org/format/2410.19265">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Deep Graph Learning under Distribution Shifts: from Graph Out-of-Distribution Generalization to Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kexin Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shuhan Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+W">Weili Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Sheng Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jundong Li</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+K">Kaize Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19265v1-abstract-short" style="display: inline;"> Distribution shifts on graphs -- the discrepancies in data distribution between training and employing a graph machine learning model -- are ubiquitous and often unavoidable in real-world scenarios. These shifts may severely deteriorate model performance, posing significant challenges for reliable graph machine learning. Consequently, there has been a surge in research on graph machine learning un… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19265v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19265v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19265v1-abstract-full" style="display: none;"> Distribution shifts on graphs -- the discrepancies in data distribution between training and employing a graph machine learning model -- are ubiquitous and often unavoidable in real-world scenarios. These shifts may severely deteriorate model performance, posing significant challenges for reliable graph machine learning. Consequently, there has been a surge in research on graph machine learning under distribution shifts, aiming to train models to achieve satisfactory performance on out-of-distribution (OOD) test data. In our survey, we provide an up-to-date and forward-looking review of deep graph learning under distribution shifts. Specifically, we cover three primary scenarios: graph OOD generalization, training-time graph OOD adaptation, and test-time graph OOD adaptation. We begin by formally formulating the problems and discussing various types of distribution shifts that can affect graph learning, such as covariate shifts and concept shifts. 
To provide a better understanding of the literature, we systematically categorize the existing models based on our proposed taxonomy and investigate the adopted techniques behind. We also summarize commonly used datasets in this research area to facilitate further investigation. Finally, we point out promising research directions and the corresponding challenges to encourage further study in this vital domain. Additionally, we provide a continuously updated reading list at https://github.com/kaize0409/Awesome-Graph-OOD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19265v1-abstract-full').style.display = 'none'; document.getElementById('2410.19265v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 2 figures. arXiv admin note: text overlap with arXiv:2402.11153</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18912">arXiv:2410.18912</a> <span> [<a href="https://arxiv.org/pdf/2410.18912">pdf</a>, <a href="https://arxiv.org/format/2410.18912">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dynamic 3D Gaussian Tracking for Graph-Based Neural Dynamics Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingtong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaifeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunzhu Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18912v1-abstract-short" style="display: inline;"> Videos of robots interacting with objects encode rich information about the objects' dynamics. However, existing video prediction approaches typically do not explicitly account for the 3D information from videos, such as robot actions and objects' 3D states, limiting their use in real-world robotic applications. In this work, we introduce a framework to learn object dynamics directly from multi-vi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18912v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18912v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18912v1-abstract-full" style="display: none;"> Videos of robots interacting with objects encode rich information about the objects' dynamics. However, existing video prediction approaches typically do not explicitly account for the 3D information from videos, such as robot actions and objects' 3D states, limiting their use in real-world robotic applications. 
In this work, we introduce a framework to learn object dynamics directly from multi-view RGB videos by explicitly considering the robot's action trajectories and their effects on scene dynamics. We utilize the 3D Gaussian representation of 3D Gaussian Splatting (3DGS) to train a particle-based dynamics model using Graph Neural Networks. This model operates on sparse control particles downsampled from the densely tracked 3D Gaussian reconstructions. By learning the neural dynamics model on offline robot interaction data, our method can predict object motions under varying initial configurations and unseen robot actions. The 3D transformations of Gaussians can be interpolated from the motions of control particles, enabling the rendering of predicted future object states and achieving action-conditioned video prediction. The dynamics model can also be applied to model-based planning frameworks for object manipulation tasks. We conduct experiments on various kinds of deformable materials, including ropes, clothes, and stuffed animals, demonstrating our framework's ability to model complex shapes and dynamics. Our project page is available at https://gs-dynamics.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18912v1-abstract-full').style.display = 'none'; document.getElementById('2410.18912v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://gs-dynamics.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18396">arXiv:2410.18396</a> <span> [<a href="https://arxiv.org/pdf/2410.18396">pdf</a>, <a href="https://arxiv.org/ps/2410.18396">ps</a>, <a href="https://arxiv.org/format/2410.18396">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Differentiable Structure Learning: Inconsistency of $\ell_1$ Penalty and Beyond </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+K">Kaifeng Jin</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+I">Ignavier Ng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kun Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Biwei Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18396v1-abstract-short" style="display: inline;"> Recent advances in differentiable structure learning have framed the combinatorial problem of learning directed acyclic graphs as a continuous optimization problem. Various aspects, including data standardization, have been studied to identify factors that influence the empirical performance of these methods. 
In this work, we investigate critical limitations in differentiable structure learning me… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18396v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18396v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18396v1-abstract-full" style="display: none;"> Recent advances in differentiable structure learning have framed the combinatorial problem of learning directed acyclic graphs as a continuous optimization problem. Various aspects, including data standardization, have been studied to identify factors that influence the empirical performance of these methods. In this work, we investigate critical limitations in differentiable structure learning methods, focusing on settings where the true structure can be identified up to Markov equivalence classes, particularly in the linear Gaussian case. While Ng et al. (2024) highlighted potential non-convexity issues in this setting, we demonstrate and explain why the use of $\ell_1$-penalized likelihood in such cases is fundamentally inconsistent, even if the global optimum of the optimization problem can be found. To resolve this limitation, we develop a hybrid differentiable structure learning method based on $\ell_0$-penalized likelihood with hard acyclicity constraint, where the $\ell_0$ penalty can be approximated by different techniques including Gumbel-Softmax. Specifically, we first estimate the underlying moral graph, and use it to restrict the search space of the optimization problem, which helps alleviate the non-convexity issue. Experimental results show that the proposed method enhances empirical performance both before and after data standardization, providing a more reliable path for future advancements in differentiable structure learning, especially for learning Markov equivalence classes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18396v1-abstract-full').style.display = 'none'; document.getElementById('2410.18396v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
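<p class="is-size-7 mathjax">A tiny numerical illustration of the penalty issue discussed in the abstract above, under simplifying assumptions (a two-variable linear Gaussian model with the causal direction fixed): an $\ell_1$ penalty on the edge weight shrinks the fitted coefficient and thereby distorts the profiled likelihood, whereas an $\ell_0$-style penalty only counts edges and leaves the weight estimate essentially unbiased. The model, sample size, and penalty strength are made up for the example; this is not the paper's analysis of Markov equivalence classes.</p> <pre><code>
import numpy as np

def neg_log_likelihood(X, W):
    """Gaussian negative log-likelihood of the linear SEM X = X W + noise,
    up to constants, with per-variable residual variances profiled out."""
    resid = X - X @ W
    return 0.5 * np.sum(np.log(resid.var(axis=0)))

rng = np.random.default_rng(0)
n = 5000
x0 = rng.normal(size=n)
x1 = 0.8 * x0 + rng.normal(size=n)      # true edge from x0 to x1, weight 0.8
X = np.column_stack([x0, x1])
lam = 0.1                               # made-up penalty strength

def penalized_score(w01, penalty):
    W = np.array([[0.0, w01], [0.0, 0.0]])
    if penalty == "l1":
        return neg_log_likelihood(X, W) + lam * abs(w01)
    return neg_log_likelihood(X, W) + lam * (w01 != 0)   # l0-style: just count edges

grid = np.linspace(0.0, 1.2, 241)
best_l1 = grid[np.argmin([penalized_score(w, "l1") for w in grid])]
best_l0 = grid[np.argmin([penalized_score(w, "l0") for w in grid])]
print(f"l1-penalized weight estimate: {best_l1:.2f} (shrunk away from 0.8)")
print(f"l0-penalized weight estimate: {best_l0:.2f} (close to 0.8)")
</code></pre>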
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18228">arXiv:2410.18228</a> <span> [<a href="https://arxiv.org/pdf/2410.18228">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MsMorph: An Unsupervised pyramid learning network for brain image registration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nan%2C+J">Jiaofen Nan</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+G">Gaodeng Fan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaifan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+F">Fubao Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Weihua Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18228v1-abstract-short" style="display: inline;"> In the field of medical image analysis, image registration is a crucial technique. Despite the numerous registration models that have been proposed, existing methods still fall short in terms of accuracy and interpretability. In this paper, we present MsMorph, a deep learning-based image registration framework aimed at mimicking the manual process of registering image pairs to achieve more similar… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18228v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18228v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18228v1-abstract-full" style="display: none;"> In the field of medical image analysis, image registration is a crucial technique. Despite the numerous registration models that have been proposed, existing methods still fall short in terms of accuracy and interpretability. In this paper, we present MsMorph, a deep learning-based image registration framework aimed at mimicking the manual process of registering image pairs to achieve more similar deformations, where the registered image pairs exhibit consistency or similarity in features. By extracting the feature differences between image pairs across various as-pects using gradients, the framework decodes semantic information at different scales and continuously compen-sates for the predicted deformation field, driving the optimization of parameters to significantly improve registration accuracy. The proposed method simulates the manual approach to registration, focusing on different regions of the image pairs and their neighborhoods to predict the deformation field between the two images, which provides strong interpretability. We compared several existing registration methods on two public brain MRI datasets, including LPBA and Mindboggle. The experimental results show that our method consistently outperforms state of the art in terms of metrics such as Dice score, Hausdorff distance, average symmetric surface distance, and non-Jacobian. 
The source code is publicly available at https://github.com/GaodengFan/MsMorph <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18228v1-abstract-full').style.display = 'none'; document.getElementById('2410.18228v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18071">arXiv:2410.18071</a> <span> [<a href="https://arxiv.org/pdf/2410.18071">pdf</a>, <a href="https://arxiv.org/format/2410.18071">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> TP-Eval: Tap Multimodal LLMs' Potential in Evaluation by Customizing Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yuxuan Xie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianhua Li</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+W">Wenqi Shao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaipeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18071v1-abstract-short" style="display: inline;"> Recently, multimodal large language models (MLLMs) have received much attention for their impressive capabilities. The evaluation of MLLMs is becoming critical to analyzing attributes of MLLMs and providing valuable insights. However, current benchmarks overlook the problem of prompt sensitivity - minor prompt variations may lead to significant performance fluctuations. Thus, inappropriate prompts… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18071v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18071v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18071v1-abstract-full" style="display: none;"> Recently, multimodal large language models (MLLMs) have received much attention for their impressive capabilities. The evaluation of MLLMs is becoming critical to analyzing attributes of MLLMs and providing valuable insights. However, current benchmarks overlook the problem of prompt sensitivity - minor prompt variations may lead to significant performance fluctuations. Thus, inappropriate prompts may obscure the models' capabilities, underestimating the models' performance. Moreover, different models have different preferences for different prompts, and thus, using the same prompt for all models will cause evaluation bias. 
This paper analyzes this deficiency in existing benchmarks and further introduces a new evaluation framework named TP-Eval, which introduces a prompt customization method to reduce evaluation biases and tap models' potential. TP-Eval will rewrite the original prompts to different customized prompts for different models. In particular, we propose some well-designed modules for prompt customization tailored to the scenario of MLLM evaluation. Extensive experiments demonstrate the effectiveness of our approach to uncovering models' capabilities, and TP-Eval should benefit the community in developing more comprehensive and convincing MLLM evaluation benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18071v1-abstract-full').style.display = 'none'; document.getElementById('2410.18071v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17242">arXiv:2410.17242</a> <span> [<a href="https://arxiv.org/pdf/2410.17242">pdf</a>, <a href="https://arxiv.org/format/2410.17242">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> LVSM: A Large View Synthesis Model with Minimal 3D Inductive Bias </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jin%2C+H">Haian Jin</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hanwen Jiang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tianyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Snavely%2C+N">Noah Snavely</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17242v1-abstract-short" style="display: inline;"> We propose the Large View Synthesis Model (LVSM), a novel transformer-based approach for scalable and generalizable novel view synthesis from sparse-view inputs. 
We introduce two architectures: (1) an encoder-decoder LVSM, which encodes input image tokens into a fixed number of 1D latent tokens, functioning as a fully learned scene representation, and decodes novel-view images from them; and (2) a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17242v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17242v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17242v1-abstract-full" style="display: none;"> We propose the Large View Synthesis Model (LVSM), a novel transformer-based approach for scalable and generalizable novel view synthesis from sparse-view inputs. We introduce two architectures: (1) an encoder-decoder LVSM, which encodes input image tokens into a fixed number of 1D latent tokens, functioning as a fully learned scene representation, and decodes novel-view images from them; and (2) a decoder-only LVSM, which directly maps input images to novel-view outputs, completely eliminating intermediate scene representations. Both models bypass the 3D inductive biases used in previous methods -- from 3D representations (e.g., NeRF, 3DGS) to network designs (e.g., epipolar projections, plane sweeps) -- addressing novel view synthesis with a fully data-driven approach. While the encoder-decoder model offers faster inference due to its independent latent representation, the decoder-only LVSM achieves superior quality, scalability, and zero-shot generalization, outperforming previous state-of-the-art methods by 1.5 to 3.5 dB PSNR. Comprehensive evaluations across multiple datasets demonstrate that both LVSM variants achieve state-of-the-art novel view synthesis quality. Notably, our models surpass all previous methods even with reduced computational resources (1-2 GPUs). Please see our website for more details: https://haian-jin.github.io/projects/LVSM/ . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17242v1-abstract-full').style.display = 'none'; document.getElementById('2410.17242v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
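<p class="is-size-7">As a minimal sketch of the decoder-only design described in the LVSM abstract above (this is not the authors' code; the module sizes, patch and ray dimensionalities, and the plain PyTorch transformer layers are assumptions made purely for illustration), a single token sequence of input-view patches and target-ray queries can be processed by one transformer stack that regresses the target-view patches:</p> <pre><code>
# Hypothetical sketch: decoder-only view synthesis (input-image tokens in, novel-view pixels out).
import torch
import torch.nn as nn

class TinyDecoderOnlyViewSynth(nn.Module):
    def __init__(self, patch_dim=192, ray_dim=6, width=256, depth=4, heads=8):
        super().__init__()
        self.in_proj = nn.Linear(patch_dim + ray_dim, width)   # input-view patches with their rays
        self.query_proj = nn.Linear(ray_dim, width)            # target-view ray queries
        layer = nn.TransformerEncoderLayer(width, heads, 4 * width, batch_first=True)
        self.blocks = nn.TransformerEncoder(layer, depth)      # full attention over all tokens
        self.out_proj = nn.Linear(width, patch_dim)            # regress target-view patch pixels

    def forward(self, src_patches, src_rays, tgt_rays):
        # src_patches: (B, N, patch_dim), src_rays: (B, N, ray_dim), tgt_rays: (B, M, ray_dim)
        src = self.in_proj(torch.cat([src_patches, src_rays], dim=-1))
        tgt = self.query_proj(tgt_rays)
        tokens = self.blocks(torch.cat([src, tgt], dim=1))     # one joint sequence, no 3D prior
        return self.out_proj(tokens[:, src.shape[1]:])         # keep only the target-query outputs

model = TinyDecoderOnlyViewSynth()
out = model(torch.randn(2, 64, 192), torch.randn(2, 64, 6), torch.randn(2, 16, 6))
print(out.shape)  # torch.Size([2, 16, 192])
</code></pre>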
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">project page: https://haian-jin.github.io/projects/LVSM/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16059">arXiv:2410.16059</a> <span> [<a href="https://arxiv.org/pdf/2410.16059">pdf</a>, <a href="https://arxiv.org/format/2410.16059">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Multi-Level Speaker Representation for Target Speaker Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Ke Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Junjie Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuai Wang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yangjie Wei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yannan Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haizhou Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16059v1-abstract-short" style="display: inline;"> Target speaker extraction (TSE) relies on a reference cue of the target to extract the target speech from a speech mixture. While a speaker embedding is commonly used as the reference cue, such embedding pre-trained with a large number of speakers may suffer from confusion of speaker identity. In this work, we propose a multi-level speaker representation approach, from raw features to neural embed… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16059v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16059v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16059v1-abstract-full" style="display: none;"> Target speaker extraction (TSE) relies on a reference cue of the target to extract the target speech from a speech mixture. While a speaker embedding is commonly used as the reference cue, such embedding pre-trained with a large number of speakers may suffer from confusion of speaker identity. In this work, we propose a multi-level speaker representation approach, from raw features to neural embeddings, to serve as the speaker reference cue. We generate a spectral-level representation from the enrollment magnitude spectrogram as a raw, low-level feature, which significantly improves the model's generalization capability. Additionally, we propose a contextual embedding feature based on cross-attention mechanisms that integrate frame-level embeddings from a pre-trained speaker encoder. By incorporating speaker features across multiple levels, we significantly enhance the performance of the TSE model. Our approach achieves a 2.74 dB improvement and a 4.94% increase in extraction accuracy on Libri2mix test set over the baseline. 
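<p class="is-size-7">A rough sketch of the multi-level speaker cue described in the abstract above, assuming made-up feature dimensions and plain PyTorch modules (the pooled spectral cue and the cross-attention over frame-level enrollment embeddings follow the abstract only loosely, and none of this is the authors' wesep implementation):</p> <pre><code>
# Hypothetical sketch: fuse a raw enrollment-spectrogram cue with a contextual cue obtained by
# cross-attending over frame-level embeddings from a (pretrained) speaker encoder.
import torch
import torch.nn as nn

class MultiLevelSpeakerCue(nn.Module):
    def __init__(self, mix_dim=256, enroll_dim=192, spec_bins=257, heads=4):
        super().__init__()
        self.spec_proj = nn.Linear(spec_bins, mix_dim)        # low-level cue from the magnitude spectrogram
        self.enroll_proj = nn.Linear(enroll_dim, mix_dim)     # frame-level speaker-encoder embeddings
        self.cross_attn = nn.MultiheadAttention(mix_dim, heads, batch_first=True)
        self.fuse = nn.Linear(3 * mix_dim, mix_dim)

    def forward(self, mix_feats, enroll_frames, enroll_spec):
        # mix_feats: (B, T, mix_dim); enroll_frames: (B, S, enroll_dim); enroll_spec: (B, S, spec_bins)
        keys = self.enroll_proj(enroll_frames)
        ctx, _ = self.cross_attn(mix_feats, keys, keys)                     # contextual cue per mixture frame
        spec_cue = self.spec_proj(enroll_spec).mean(dim=1, keepdim=True)    # pooled spectral cue
        spec_cue = spec_cue.expand(-1, mix_feats.shape[1], -1)
        return self.fuse(torch.cat([mix_feats, ctx, spec_cue], dim=-1))     # conditioned features for extraction

cue = MultiLevelSpeakerCue()
print(cue(torch.randn(2, 100, 256), torch.randn(2, 50, 192), torch.randn(2, 50, 257)).shape)  # (2, 100, 256)
</code></pre>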
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16059v1-abstract-full').style.display = 'none'; document.getElementById('2410.16059v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">5 pages. Submitted to ICASSP 2025. Implementation will be released at https://github.com/wenet-e2e/wesep</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16037">arXiv:2410.16037</a> <span> [<a href="https://arxiv.org/pdf/2410.16037">pdf</a>, <a href="https://arxiv.org/ps/2410.16037">ps</a>, <a href="https://arxiv.org/format/2410.16037">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Improving the Multi-label Atomic Activity Recognition by Robust Visual Feature and Advanced Attention @ ROAD++ Atomic Activity Recognition 2024 </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiamin Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lingqi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kexin Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuting Yang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+L">Licheng Jiao</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yuwei Guo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16037v1-abstract-short" style="display: inline;"> Road++ Track3 proposes a multi-label atomic activity recognition task in traffic scenarios, which can be standardized as a 64-class multi-label video action recognition task. In the multi-label atomic activity recognition task, the robustness of visual feature extraction remains a key challenge, which directly affects the model performance and generalization ability. To cope with these issues, our… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16037v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16037v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16037v1-abstract-full" style="display: none;"> Road++ Track3 proposes a multi-label atomic activity recognition task in traffic scenarios, which can be standardized as a 64-class multi-label video action recognition task. In the multi-label atomic activity recognition task, the robustness of visual feature extraction remains a key challenge, which directly affects the model performance and generalization ability. To cope with these issues, our team optimized three aspects: data processing, model and post-processing. Firstly, the appropriate resolution and video sampling strategy are selected, and a fixed sampling strategy is set on the validation and test sets. 
Secondly, in terms of model training, the team selects a variety of visual backbone networks for feature extraction, and then introduces the action-slot model, which is trained on the training and validation sets and used for inference on the test set. Finally, for post-processing, the team applied weighted fusion to balance the strengths and weaknesses of the different models, and the final mAP on the test set was 58%, which is 4% higher than the challenge baseline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16037v1-abstract-full').style.display = 'none'; document.getElementById('2410.16037v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15319">arXiv:2410.15319</a> <span> [<a href="https://arxiv.org/pdf/2410.15319">pdf</a>, <a href="https://arxiv.org/format/2410.15319">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Causality for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+A">Anpeng Wu</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+K">Kun Kuang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Minqin Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yingrong Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yujia Zheng</a>, <a href="/search/cs?searchtype=author&query=Han%2C+K">Kairong Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Baohong Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guangyi Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15319v1-abstract-short" style="display: inline;"> Recent breakthroughs in artificial intelligence have driven a paradigm shift, where large language models (LLMs) with billions or trillions of parameters are trained on vast datasets, achieving unprecedented success across a series of language tasks.
However, despite these successes, LLMs still rely on probabilistic modeling, which often captures spurious correlations rooted in linguistic patterns… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15319v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15319v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15319v1-abstract-full" style="display: none;"> Recent breakthroughs in artificial intelligence have driven a paradigm shift, where large language models (LLMs) with billions or trillions of parameters are trained on vast datasets, achieving unprecedented success across a series of language tasks. However, despite these successes, LLMs still rely on probabilistic modeling, which often captures spurious correlations rooted in linguistic patterns and social stereotypes, rather than the true causal relationships between entities and events. This limitation renders LLMs vulnerable to issues such as demographic biases, social stereotypes, and LLM hallucinations. These challenges highlight the urgent need to integrate causality into LLMs, moving beyond correlation-driven paradigms to build more reliable and ethically aligned AI systems. While many existing surveys and studies focus on utilizing prompt engineering to activate LLMs for causal knowledge or developing benchmarks to assess their causal reasoning abilities, most of these efforts rely on human intervention to activate pre-trained models. How to embed causality into the training process of LLMs and build more general and intelligent models remains unexplored. Recent research highlights that LLMs function as causal parrots, capable of reciting causal knowledge without truly understanding or applying it. These prompt-based methods are still limited to improvements driven by human intervention. This survey aims to address this gap by exploring how causality can enhance LLMs at every stage of their lifecycle -- from token embedding learning and foundation model training to fine-tuning, alignment, inference, and evaluation -- paving the way for more interpretable, reliable, and causally-informed models. Additionally, we further outline six promising future directions to advance LLM development, enhance their causal reasoning capabilities, and address the current limitations these models face. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15319v1-abstract-full').style.display = 'none'; document.getElementById('2410.15319v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15273">arXiv:2410.15273</a> <span> [<a href="https://arxiv.org/pdf/2410.15273">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> ArchiTone: A LEGO-Inspired Gamified System for Visualized Music Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+J">Jiaxing Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tieyao Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+S">Songruoyao Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xinda Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tingxiao Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanjun Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kejun Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15273v1-abstract-short" style="display: inline;"> Participation in music activities has many benefits, but often requires music theory knowledge and aural skills, which can be challenging for beginners. To help them engage more easily, it's crucial to adopt teaching strategies that lower these barriers. Informed by formative investigation and inspired by LEGO, we introduce ArchiTone, a gamified system that employs constructivism by visualizing mu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15273v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15273v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15273v1-abstract-full" style="display: none;"> Participation in music activities has many benefits, but often requires music theory knowledge and aural skills, which can be challenging for beginners. To help them engage more easily, it's crucial to adopt teaching strategies that lower these barriers. Informed by formative investigation and inspired by LEGO, we introduce ArchiTone, a gamified system that employs constructivism by visualizing music theory concepts as musical blocks and buildings for music education. This system includes two modes: Learning Mode, which involves recognizing and learning common musical blocks through familiar musical works; Creation Mode, which allows learners to freely create and combine musical blocks to produce new musical works. User studies demonstrate that our gamified system is not only more engaging than traditional music education methods but also more effective in helping learners understand abstract music theory and apply it to music praxis. Additionally, learners demonstrate superior performance on music theory tasks after using ArchiTone. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15273v1-abstract-full').style.display = 'none'; document.getElementById('2410.15273v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14214">arXiv:2410.14214</a> <span> [<a href="https://arxiv.org/pdf/2410.14214">pdf</a>, <a href="https://arxiv.org/format/2410.14214">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> MambaSCI: Efficient Mamba-UNet for Quad-Bayer Patterned Video Snapshot Compressive Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+Z">Zhenghao Pan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+H">Haijin Zeng</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jiezhang Cao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yongyong Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yong Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14214v1-abstract-short" style="display: inline;"> Color video snapshot compressive imaging (SCI) employs computational imaging techniques to capture multiple sequential video frames in a single Bayer-patterned measurement. With the increasing popularity of quad-Bayer pattern in mainstream smartphone cameras for capturing high-resolution videos, mobile photography has become more accessible to a wider audience. However, existing color video SCI re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14214v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14214v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14214v1-abstract-full" style="display: none;"> Color video snapshot compressive imaging (SCI) employs computational imaging techniques to capture multiple sequential video frames in a single Bayer-patterned measurement. With the increasing popularity of quad-Bayer pattern in mainstream smartphone cameras for capturing high-resolution videos, mobile photography has become more accessible to a wider audience. However, existing color video SCI reconstruction algorithms are designed based on the traditional Bayer pattern. When applied to videos captured by quad-Bayer cameras, these algorithms often result in color distortion and ineffective demosaicing, rendering them impractical for primary equipment. To address this challenge, we propose the MambaSCI method, which leverages the Mamba and UNet architectures for efficient reconstruction of quad-Bayer patterned color video SCI. To the best of our knowledge, our work presents the first algorithm for quad-Bayer patterned SCI reconstruction, and also the initial application of the Mamba model to this task. Specifically, we customize Residual-Mamba-Blocks, which residually connect the Spatial-Temporal Mamba (STMamba), Edge-Detail-Reconstruction (EDR) module, and Channel Attention (CA) module. 
Respectively, STMamba is used to model long-range spatial-temporal dependencies with linear complexity, EDR is for better edge-detail reconstruction, and CA is used to compensate for the missing channel information interaction in Mamba model. Experiments demonstrate that MambaSCI surpasses state-of-the-art methods with lower computational and memory costs. PyTorch style pseudo-code for the core modules is provided in the supplementary materials. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14214v1-abstract-full').style.display = 'none'; document.getElementById('2410.14214v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13754">arXiv:2410.13754</a> <span> [<a href="https://arxiv.org/pdf/2410.13754">pdf</a>, <a href="https://arxiv.org/format/2410.13754">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> MixEval-X: Any-to-Any Evaluations from Real-World Data Mixtures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ni%2C+J">Jinjie Ni</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yifan Song</a>, <a href="/search/cs?searchtype=author&query=Ghosal%2C+D">Deepanway Ghosal</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D+J">David Junhao Zhang</a>, <a href="/search/cs?searchtype=author&query=Yue%2C+X">Xiang Yue</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+F">Fuzhao Xue</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zian Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaichen Zhang</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+M">Mahir Shah</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+K">Kabir Jain</a>, <a href="/search/cs?searchtype=author&query=You%2C+Y">Yang You</a>, <a href="/search/cs?searchtype=author&query=Shieh%2C+M">Michael Shieh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13754v2-abstract-short" style="display: inline;"> Perceiving and generating diverse modalities are crucial for AI models to effectively learn from and engage with real-world signals, necessitating reliable evaluations for their development. 
We identify two major issues in current evaluations: (1) inconsistent standards, shaped by different communities with varying protocols and maturity levels; and (2) significant query, grading, and generalizati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13754v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13754v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13754v2-abstract-full" style="display: none;"> Perceiving and generating diverse modalities are crucial for AI models to effectively learn from and engage with real-world signals, necessitating reliable evaluations for their development. We identify two major issues in current evaluations: (1) inconsistent standards, shaped by different communities with varying protocols and maturity levels; and (2) significant query, grading, and generalization biases. To address these, we introduce MixEval-X, the first any-to-any, real-world benchmark designed to optimize and standardize evaluations across diverse input and output modalities. We propose multi-modal benchmark mixture and adaptation-rectification pipelines to reconstruct real-world task distributions, ensuring evaluations generalize effectively to real-world use cases. Extensive meta-evaluations show our approach effectively aligns benchmark samples with real-world task distributions. Meanwhile, MixEval-X's model rankings correlate strongly with that of crowd-sourced real-world evaluations (up to 0.98) while being much more efficient. We provide comprehensive leaderboards to rerank existing models and organizations and offer insights to enhance understanding of multi-modal evaluations and inform future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13754v2-abstract-full').style.display = 'none'; document.getElementById('2410.13754v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13343">arXiv:2410.13343</a> <span> [<a href="https://arxiv.org/pdf/2410.13343">pdf</a>, <a href="https://arxiv.org/format/2410.13343">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Do LLMs Overcome Shortcut Learning? 
An Evaluation of Shortcut Challenges in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yu Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lili Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+G">Guangting Zheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13343v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have shown remarkable capabilities in various natural language processing tasks. However, LLMs may rely on dataset biases as shortcuts for prediction, which can significantly impair their robustness and generalization capabilities. This paper presents Shortcut Suite, a comprehensive test suite designed to evaluate the impact of shortcuts on LLMs' performance, incorpora… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13343v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13343v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13343v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have shown remarkable capabilities in various natural language processing tasks. However, LLMs may rely on dataset biases as shortcuts for prediction, which can significantly impair their robustness and generalization capabilities. This paper presents Shortcut Suite, a comprehensive test suite designed to evaluate the impact of shortcuts on LLMs' performance, incorporating six shortcut types, five evaluation metrics, and four prompting strategies. Our extensive experiments yield several key findings: 1) LLMs demonstrate varying reliance on shortcuts for downstream tasks, significantly impairing their performance. 2) Larger LLMs are more likely to utilize shortcuts under zero-shot and few-shot in-context learning prompts. 3) Chain-of-thought prompting notably reduces shortcut reliance and outperforms other prompting strategies, while few-shot prompts generally underperform compared to zero-shot prompts. 4) LLMs often exhibit overconfidence in their predictions, especially when dealing with datasets that contain shortcuts. 5) LLMs generally have a lower explanation quality in shortcut-laden datasets, with errors falling into three types: distraction, disguised comprehension, and logical fallacy. Our findings offer new insights for evaluating robustness and generalization in LLMs and suggest potential directions for mitigating the reliance on shortcuts. The code is available at https://github.com/yyhappier/ShortcutSuite.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13343v1-abstract-full').style.display = 'none'; document.getElementById('2410.13343v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
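<p class="is-size-7">As a loose illustration of comparing prompting strategies on a labeled dataset, in the spirit of the evaluation above but in no way the Shortcut Suite code (the two templates, the string-matching scoring, and the placeholder query_model are all invented for this sketch):</p> <pre><code>
# Hypothetical harness: score a model under different prompting strategies on (text, label) pairs.
def query_model(prompt):
    return "positive"   # placeholder; a real harness would call an actual LLM here

PROMPTS = {
    "zero_shot": "Classify the sentiment of this review as positive or negative.\n{review}\nAnswer:",
    "chain_of_thought": ("Classify the sentiment of this review as positive or negative. "
                         "Reason step by step, then give the final label.\n{review}\nAnswer:"),
}

def evaluate(dataset):
    scores = {}
    for name, template in PROMPTS.items():
        correct = sum(
            gold in query_model(template.format(review=review)).strip().lower()
            for review, gold in dataset
        )
        scores[name] = correct / max(len(dataset), 1)
    return scores

toy = [("The plot was wonderful and moving.", "positive"),
       ("I want my two hours back.", "negative")]
print(evaluate(toy))
</code></pre>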
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13187">arXiv:2410.13187</a> <span> [<a href="https://arxiv.org/pdf/2410.13187">pdf</a>, <a href="https://arxiv.org/format/2410.13187">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> aiXcoder-7B: A Lightweight and Effective Large Language Model for Code Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+S">Siyuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jia Li</a>, <a href="/search/cs?searchtype=author&query=Zong%2C+H">He Zong</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huanyu Liu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Hao Zhu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shukai Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+E">Erlu Li</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+J">Jiazheng Ding</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Y">Yu Han</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+W">Wei Ning</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Gen Wang</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yihong Dong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kechi Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Ge Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13187v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) have been widely used in code completion, and researchers are focusing on scaling up LLMs to improve their accuracy. However, larger LLMs will increase the response time of code completion and decrease the developers' productivity. In this paper, we propose a lightweight and effective LLM for code completion named aiXcoder-7B. Compared to existing LLMs, aiXcoder-7B ach… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13187v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13187v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13187v2-abstract-full" style="display: none;"> Large Language Models (LLMs) have been widely used in code completion, and researchers are focusing on scaling up LLMs to improve their accuracy. However, larger LLMs will increase the response time of code completion and decrease the developers' productivity. In this paper, we propose a lightweight and effective LLM for code completion named aiXcoder-7B. Compared to existing LLMs, aiXcoder-7B achieves higher code completion accuracy while having smaller scales (i.e., 7 billion parameters). We attribute the superiority of aiXcoder-7B to three key factors: (1) Multi-objective training. We employ three training objectives, one of which is our proposed Structured Fill-In-the-Middle (SFIM). 
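<p class="is-size-7">(The abstract continues below.) For intuition only, a plain fill-in-the-middle training sample can be built by cutting a snippet into prefix, middle, and suffix and asking the model to generate the middle; this generic sketch is not the paper's SFIM, which additionally selects the masked span from the code's syntax structure, and the sentinel strings here are invented:</p> <pre><code>
# Hypothetical sketch: generic fill-in-the-middle (FIM) sample construction.
import random

PREFIX_TOK, SUFFIX_TOK, MIDDLE_TOK = "[fim_prefix]", "[fim_suffix]", "[fim_middle]"

def make_fim_sample(code, rng):
    """Split code into prefix/middle/suffix and rearrange so the model predicts the middle."""
    a, b = sorted(rng.sample(range(len(code)), 2))
    prefix, middle, suffix = code[:a], code[a:b], code[b:]
    model_input = PREFIX_TOK + prefix + SUFFIX_TOK + suffix + MIDDLE_TOK   # prefix-suffix-middle ordering
    return model_input, middle

snippet = "def add(a, b):\n    return a + b\n"
x, y = make_fim_sample(snippet, random.Random(0))
print(repr(x))
print(repr(y))
</code></pre>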
SFIM considers the syntax structures in code and effectively improves the performance of LLMs for code. (2) Diverse data sampling strategies. They consider inter-file relationships and enhance the capability of LLMs in understanding cross-file contexts. (3) Extensive high-quality data. We establish a rigorous data collection pipeline and consume a total of 1.2 trillion unique tokens for training aiXcoder-7B. This vast volume of data enables aiXcoder-7B to learn a broad distribution of code. We evaluate aiXcoder-7B on five popular code completion benchmarks and a new benchmark collected by this paper. The results show that aiXcoder-7B outperforms the six latest LLMs of similar size and even surpasses four larger LLMs (e.g., StarCoder2-15B and CodeLlama-34B), positioning aiXcoder-7B as a lightweight and effective LLM for academia and industry. Finally, we summarize three valuable insights for helping practitioners train the next generations of LLMs for code. aiXcoder-7B has been open-sourced and gained significant attention. As of the submission date, aiXcoder-7B has received 2,193 GitHub Stars. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13187v2-abstract-full').style.display = 'none'; document.getElementById('2410.13187v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">aiXcoder-7B is available at https://github.com/aixcoder-plugin/aiXcoder-7B</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12781">arXiv:2410.12781</a> <span> [<a href="https://arxiv.org/pdf/2410.12781">pdf</a>, <a href="https://arxiv.org/format/2410.12781">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Long-LRM: Long-sequence Large Reconstruction Model for Wide-coverage Gaussian Splats </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ziwen%2C+C">Chen Ziwen</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+S">Sai Bi</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+F">Fujun Luan</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Y">Yicong Hong</a>, <a href="/search/cs?searchtype=author&query=Fuxin%2C+L">Li Fuxin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zexiang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12781v1-abstract-short" style="display: inline;"> We propose Long-LRM, a generalizable 3D Gaussian reconstruction model that is capable of reconstructing a large scene from a long sequence of input images.
Specifically, our model can process 32 source images at 960x540 resolution within only 1.3 seconds on a single A100 80G GPU. Our architecture features a mixture of the recent Mamba2 blocks and the classical transformer blocks which allowed many… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12781v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12781v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12781v1-abstract-full" style="display: none;"> We propose Long-LRM, a generalizable 3D Gaussian reconstruction model that is capable of reconstructing a large scene from a long sequence of input images. Specifically, our model can process 32 source images at 960x540 resolution within only 1.3 seconds on a single A100 80G GPU. Our architecture features a mixture of the recent Mamba2 blocks and the classical transformer blocks which allowed many more tokens to be processed than prior work, enhanced by efficient token merging and Gaussian pruning steps that balance between quality and efficiency. Unlike previous feed-forward models that are limited to processing 1~4 input images and can only reconstruct a small portion of a large scene, Long-LRM reconstructs the entire scene in a single feed-forward step. On large-scale scene datasets such as DL3DV-140 and Tanks and Temples, our method achieves performance comparable to optimization-based approaches while being two orders of magnitude more efficient. Project page: https://arthurhero.github.io/projects/llrm <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12781v1-abstract-full').style.display = 'none'; document.getElementById('2410.12781v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
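<p class="is-size-7">A self-contained sketch of the kind of interleaved backbone the Long-LRM abstract above describes, with heavy simplifications: a GRU block stands in for a Mamba2 layer so the example runs with plain PyTorch, the widths, depths, and the 2x token-merging step are assumptions, and Gaussian-parameter prediction and pruning are omitted entirely:</p> <pre><code>
# Hypothetical sketch: alternate a linear-time sequence mixer with full-attention transformer
# blocks, merging adjacent tokens once to shorten long input sequences.
import torch
import torch.nn as nn

class SequentialMixBlock(nn.Module):
    """Stand-in for a linear-time sequence mixer (a real model would use a Mamba2 block here)."""
    def __init__(self, width):
        super().__init__()
        self.rnn = nn.GRU(width, width, batch_first=True)
        self.norm = nn.LayerNorm(width)

    def forward(self, x):
        y, _ = self.rnn(x)
        return self.norm(x + y)

class TinyLongSequenceBackbone(nn.Module):
    def __init__(self, width=256, heads=8, pairs=2):
        super().__init__()
        blocks = []
        for _ in range(pairs):   # interleave: sequence mixer, then a classical transformer block
            blocks.append(SequentialMixBlock(width))
            blocks.append(nn.TransformerEncoderLayer(width, heads, 4 * width, batch_first=True))
        self.blocks = nn.ModuleList(blocks)
        self.merge = nn.Linear(2 * width, width)   # merge adjacent token pairs to halve the length

    def forward(self, tokens):
        half = len(self.blocks) // 2
        for blk in self.blocks[:half]:
            tokens = blk(tokens)
        b, n, d = tokens.shape
        tokens = self.merge(tokens[:, : n - n % 2].reshape(b, n // 2, 2 * d))
        for blk in self.blocks[half:]:
            tokens = blk(tokens)
        return tokens

net = TinyLongSequenceBackbone()
print(net(torch.randn(1, 1024, 256)).shape)  # torch.Size([1, 512, 256])
</code></pre>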
</p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Zhang%2C+K&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" 
target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>