Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,272 results for author: <span class="mathjax">Lin, J</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Lin%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Lin, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Lin%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Lin, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lin%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lin%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13580">arXiv:2411.13580</a> <span> [<a href="https://arxiv.org/pdf/2411.13580">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.autcon.2017.06.021">10.1016/j.autcon.2017.06.021 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Multi-Server Information-Sharing Environment for Cross-Party Collaboration on A Private Cloud </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianping Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qiang Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhenzhong Hu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiarui Lin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Fangqiang Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13580v1-abstract-short" style="display: inline;"> Interoperability remains the key problem in multi-discipline collaboration based on building information modeling (BIM). Although various methods have been proposed to solve the technical issues of interoperability, such as data sharing and data consistency; organizational issues, including data ownership and data privacy, remain unresolved to date. These organizational issues prevent different st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13580v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13580v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13580v1-abstract-full" style="display: none;"> Interoperability remains the key problem in multi-discipline collaboration based on building information modeling (BIM). Although various methods have been proposed to solve the technical issues of interoperability, such as data sharing and data consistency; organizational issues, including data ownership and data privacy, remain unresolved to date. These organizational issues prevent different stakeholders from sharing their data due to concerns regarding losing control of the data. This study proposes a multi-server information-sharing approach on a private cloud after analyzing the requirements for cross-party collaboration to address the aforementioned issues and prepare for massive data handling in the near future. This approach adopts a global controller to track the location, ownership and privacy of the data, which are stored in different servers that are controlled by different parties. Furthermore, data consistency conventions, parallel sub-model extraction, and sub-model integration with model verification are investigated in depth to support information sharing in a distributed environment and to maintain data consistency. Thus, with this approach, the ownership and privacy of the data can be controlled by its owner while still enabling certain required data to be shared with other parties. Application of the multi-server approach for information interoperability and cross-party collaboration is illustrated using a real construction project of an airport terminal. Validation shows that the proposed approach is feasible for maintaining the ownership and privacy of the data while supporting cross-party data sharing and collaboration at the same time, thus avoiding possible legal problems regarding data copyrights or other legal issues. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13580v1-abstract-full').style.display = 'none'; document.getElementById('2411.13580v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Automation in Construction,2017 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12250">arXiv:2411.12250</a> <span> [<a href="https://arxiv.org/pdf/2411.12250">pdf</a>, <a href="https://arxiv.org/format/2411.12250">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> ADV2E: Bridging the Gap Between Analogue Circuit and Discrete Frames in the Video-to-Events Simulator </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xiao Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Fei Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiongzhi Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12250v1-abstract-short" style="display: inline;"> Event cameras operate fundamentally differently from traditional Active Pixel Sensor (APS) cameras, offering significant advantages. Recent research has developed simulators to convert video frames into events, addressing the shortage of real event datasets. Current simulators primarily focus on the logical behavior of event cameras. However, the fundamental analogue properties of pixel circuits a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12250v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12250v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12250v1-abstract-full" style="display: none;"> Event cameras operate fundamentally differently from traditional Active Pixel Sensor (APS) cameras, offering significant advantages. Recent research has developed simulators to convert video frames into events, addressing the shortage of real event datasets. Current simulators primarily focus on the logical behavior of event cameras. However, the fundamental analogue properties of pixel circuits are seldom considered in simulator design. The gap between analogue pixel circuit and discrete video frames causes the degeneration of synthetic events, particularly in high-contrast scenes. In this paper, we propose a novel method of generating reliable event data based on a detailed analysis of the pixel circuitry in event cameras. We incorporate the analogue properties of event camera pixel circuits into the simulator design: (1) analogue filtering of signals from light intensity to events, and (2) a cutoff frequency that is independent of video frame rate. Experimental results on two relevant tasks, including semantic segmentation and image reconstruction, validate the reliability of simulated event data, even in high-contrast scenes. This demonstrates that deep neural networks exhibit strong generalization from simulated to real event data, confirming that the synthetic events generated by the proposed method are both realistic and well-suited for effective training. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12250v1-abstract-full').style.display = 'none'; document.getElementById('2411.12250v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10685">arXiv:2411.10685</a> <span> [<a href="https://arxiv.org/pdf/2411.10685">pdf</a>, <a href="https://arxiv.org/format/2411.10685">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> From Prototypes to General Distributions: An Efficient Curriculum for Masked Image Modeling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jinhong Lin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Cheng-En Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huanran Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jifan Zhang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y+H">Yu Hen Hu</a>, <a href="/search/cs?searchtype=author&query=Morgado%2C+P">Pedro Morgado</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10685v1-abstract-short" style="display: inline;"> Masked Image Modeling (MIM) has emerged as a powerful self-supervised learning paradigm for visual representation learning, enabling models to acquire rich visual representations by predicting masked portions of images from their visible regions. While this approach has shown promising results, we hypothesize that its effectiveness may be limited by optimization challenges during early training st… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10685v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10685v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10685v1-abstract-full" style="display: none;"> Masked Image Modeling (MIM) has emerged as a powerful self-supervised learning paradigm for visual representation learning, enabling models to acquire rich visual representations by predicting masked portions of images from their visible regions. While this approach has shown promising results, we hypothesize that its effectiveness may be limited by optimization challenges during early training stages, where models are expected to learn complex image distributions from partial observations before developing basic visual processing capabilities. To address this limitation, we propose a prototype-driven curriculum leagrning framework that structures the learning process to progress from prototypical examples to more complex variations in the dataset. Our approach introduces a temperature-based annealing scheme that gradually expands the training distribution, enabling more stable and efficient learning trajectories. Through extensive experiments on ImageNet-1K, we demonstrate that our curriculum learning strategy significantly improves both training efficiency and representation quality while requiring substantially fewer training epochs compared to standard Masked Auto-Encoding. Our findings suggest that carefully controlling the order of training examples plays a crucial role in self-supervised visual learning, providing a practical solution to the early-stage optimization challenges in MIM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10685v1-abstract-full').style.display = 'none'; document.getElementById('2411.10685v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09951">arXiv:2411.09951</a> <span> [<a href="https://arxiv.org/pdf/2411.09951">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1111/mice.12151">10.1111/mice.12151 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A natural-language-based approach to intelligent data retrieval and representation for cloud BIM </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jia-Rui Lin</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhen-Zhong Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian-Ping Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+F">Fang-Qiang Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09951v1-abstract-short" style="display: inline;"> As the information from diverse disciplines continues to integrate during the whole life cycle of an Architecture, Engineering, and Construction (AEC) project, the BIM (Building Information Model/Modeling) becomes increasingly large. This condition will cause users difficulty in acquiring the information they truly desire on a mobile device with limited space for interaction. To improve the value… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09951v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09951v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09951v1-abstract-full" style="display: none;"> As the information from diverse disciplines continues to integrate during the whole life cycle of an Architecture, Engineering, and Construction (AEC) project, the BIM (Building Information Model/Modeling) becomes increasingly large. This condition will cause users difficulty in acquiring the information they truly desire on a mobile device with limited space for interaction. To improve the value of the big data of BIM, an approach to intelligent data retrieval and representation for cloud BIM applications based on natural language processing was proposed. First, strategies for data storage and query acceleration based on the popular cloud-based database were explored to handle the large amount of BIM data. Then, the concepts keyword and constraint were proposed to capture the key objects and their specifications in a natural-language-based sentence that expresses the requirements of the user. Keywords and constraints can be mapped to IFC entities or properties through the International Framework for Dictionaries (IFD). The relationship between the user's requirement and the IFC-based data model was established by path finding in a graph generated from the IFC schema, enabling data retrieval and analysis. Finally, the analyzed and summarized results of BIM data were represented based on the structure of the retrieved data. A prototype application was developed to validate the proposed approach on the data collected during the construction of the terminal of Kunming Airport, the largest single building in China. With this approach, users can significantly benefit from requesting for information and the value of BIM will be enhanced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09951v1-abstract-full').style.display = 'none'; document.getElementById('2411.09951v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Computer Aided Civil and Infrastructure Engineering, 2016 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09607">arXiv:2411.09607</a> <span> [<a href="https://arxiv.org/pdf/2411.09607">pdf</a>, <a href="https://arxiv.org/format/2411.09607">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Initial Nugget Evaluation Results for the TREC 2024 RAG Track with the AutoNuggetizer Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pradeep%2C+R">Ronak Pradeep</a>, <a href="/search/cs?searchtype=author&query=Thakur%2C+N">Nandan Thakur</a>, <a href="/search/cs?searchtype=author&query=Upadhyay%2C+S">Shivani Upadhyay</a>, <a href="/search/cs?searchtype=author&query=Campos%2C+D">Daniel Campos</a>, <a href="/search/cs?searchtype=author&query=Craswell%2C+N">Nick Craswell</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jimmy Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09607v1-abstract-short" style="display: inline;"> This report provides an initial look at partial results from the TREC 2024 Retrieval-Augmented Generation (RAG) Track. We have identified RAG evaluation as a barrier to continued progress in information access (and more broadly, natural language processing and artificial intelligence), and it is our hope that we can contribute to tackling the many challenges in this space. The central hypothesis w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09607v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09607v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09607v1-abstract-full" style="display: none;"> This report provides an initial look at partial results from the TREC 2024 Retrieval-Augmented Generation (RAG) Track. We have identified RAG evaluation as a barrier to continued progress in information access (and more broadly, natural language processing and artificial intelligence), and it is our hope that we can contribute to tackling the many challenges in this space. The central hypothesis we explore in this work is that the nugget evaluation methodology, originally developed for the TREC Question Answering Track in 2003, provides a solid foundation for evaluating RAG systems. As such, our efforts have focused on "refactoring" this methodology, specifically applying large language models to both automatically create nuggets and to automatically assign nuggets to system answers. We call this the AutoNuggetizer framework. Within the TREC setup, we are able to calibrate our fully automatic process against a manual process whereby nuggets are created by human assessors semi-manually and then assigned manually to system answers. Based on initial results across 21 topics from 45 runs, we observe a strong correlation between scores derived from a fully automatic nugget evaluation and a (mostly) manual nugget evaluation by human assessors. This suggests that our fully automatic evaluation process can be used to guide future iterations of RAG systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09607v1-abstract-full').style.display = 'none'; document.getElementById('2411.09607v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09486">arXiv:2411.09486</a> <span> [<a href="https://arxiv.org/pdf/2411.09486">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1016/j.autcon.2021.103643">10.1016/j.autcon.2021.103643 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> An Approach to Twinning and Mining Collaborative Network of Construction Projects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jia-Rui Lin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D">Da-Peng Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09486v1-abstract-short" style="display: inline;"> Understanding complex collaboration processes is essential for the success of construction projects. However, there is still a lack of efficient methods for timely collection and analysis of collaborative networks. Therefore, an integrated framework consisting three parts, namely, system updating for data collection, data preprocessing, and social network analysis, is proposed for the twinning and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09486v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09486v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09486v1-abstract-full" style="display: none;"> Understanding complex collaboration processes is essential for the success of construction projects. However, there is still a lack of efficient methods for timely collection and analysis of collaborative networks. Therefore, an integrated framework consisting three parts, namely, system updating for data collection, data preprocessing, and social network analysis, is proposed for the twinning and mining collaborative network of a construction project. First, a system updating strategy for automatic data collection is introduced. Centrality measures are then utilized to identify key players, including hubs and brokers. Meanwhile, information sharing frequency (ISF) and association rule mining are introduced to discover collaborative patterns, that is, frequently collaborating users (FCUs) and associations between information flows and task levels. Finally, the proposed framework is validated and demonstrated in a large-scale project. The results show that key players, FCUs, and associations between information flows and task levels were successfully discovered, providing a deep understanding of collaboration and communication for decision-making processes. This research contributes to the body of knowledge by: 1) introducing ISF and Apriori-based association mining algorithm to identify FCUs and information flow patterns in collaboration; 2) establishing a new data-driven framework to map and analyze fine-grained collaborative networks automatically. It is also shown that people tend to form small groups to handle certain levels or types of tasks more efficiently. Other researchers and industrial practitioners may use this work as a foundation to further improve the efficiency of collaboration and communication. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09486v1-abstract-full').style.display = 'none'; document.getElementById('2411.09486v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Automation in Construction, 2021 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09481">arXiv:2411.09481</a> <span> [<a href="https://arxiv.org/pdf/2411.09481">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> What makes a good BIM design: quantitative linking between design behavior and quality </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ni%2C+X">Xiang-Rui Ni</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+P">Peng Pan</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jia-Rui Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09481v1-abstract-short" style="display: inline;"> In the Architecture Engineering & Construction (AEC) industry, how design behaviors impact design quality remains unclear. This study proposes a novel approach, which, for the first time, identifies and quantitatively describes the relationship between design behaviors and quality of design based on Building Information Modeling (BIM). Real-time collection and log mining are integrated to collect… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09481v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09481v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09481v1-abstract-full" style="display: none;"> In the Architecture Engineering & Construction (AEC) industry, how design behaviors impact design quality remains unclear. This study proposes a novel approach, which, for the first time, identifies and quantitatively describes the relationship between design behaviors and quality of design based on Building Information Modeling (BIM). Real-time collection and log mining are integrated to collect raw data of design behaviors. Feature engineering and various machine learning models are then utilized for quantitative modeling and interpretation. Results confirm an existing quantifiable relationship which can be learned by various models. The best-performing model using Extremely Random Trees achieved an R2 value of 0.88 on the test set. Behavioral features related to designer's skill level and changes of design intentions are identified to have significant impacts on design quality. These findings deepen our understanding of the design process and help forming BIM designs with better quality. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09481v1-abstract-full').style.display = 'none'; document.getElementById('2411.09481v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09153">arXiv:2411.09153</a> <span> [<a href="https://arxiv.org/pdf/2411.09153">pdf</a>, <a href="https://arxiv.org/format/2411.09153">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> VidMan: Exploiting Implicit Dynamics from Video Diffusion Model for Effective Robot Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Youpeng Wen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junfan Lin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yi Zhu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+J">Jianhua Han</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shen Zhao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09153v1-abstract-short" style="display: inline;"> Recent advancements utilizing large-scale video data for learning video generation models demonstrate significant potential in understanding complex physical dynamics. It suggests the feasibility of leveraging diverse robot trajectory data to develop a unified, dynamics-aware model to enhance robot manipulation. However, given the relatively small amount of available robot data, directly fitting d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09153v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09153v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09153v1-abstract-full" style="display: none;"> Recent advancements utilizing large-scale video data for learning video generation models demonstrate significant potential in understanding complex physical dynamics. It suggests the feasibility of leveraging diverse robot trajectory data to develop a unified, dynamics-aware model to enhance robot manipulation. However, given the relatively small amount of available robot data, directly fitting data without considering the relationship between visual observations and actions could lead to suboptimal data utilization. To this end, we propose VidMan (Video Diffusion for Robot Manipulation), a novel framework that employs a two-stage training mechanism inspired by dual-process theory from neuroscience to enhance stability and improve data utilization efficiency. Specifically, in the first stage, VidMan is pre-trained on the Open X-Embodiment dataset (OXE) for predicting future visual trajectories in a video denoising diffusion manner, enabling the model to develop a long horizontal awareness of the environment's dynamics. In the second stage, a flexible yet effective layer-wise self-attention adapter is introduced to transform VidMan into an efficient inverse dynamics model that predicts action modulated by the implicit dynamics knowledge via parameter sharing. Our VidMan framework outperforms state-of-the-art baseline model GR-1 on the CALVIN benchmark, achieving a 11.7% relative improvement, and demonstrates over 9% precision gains on the OXE small-scale dataset. These results provide compelling evidence that world models can significantly enhance the precision of robot action prediction. Codes and models will be public. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09153v1-abstract-full').style.display = 'none'; document.getElementById('2411.09153v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09121">arXiv:2411.09121</a> <span> [<a href="https://arxiv.org/pdf/2411.09121">pdf</a>, <a href="https://arxiv.org/format/2411.09121">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Formal Languages and Automata Theory">cs.FL</span> </div> </div> <p class="title is-5 mathjax"> AutoQ 2.0: From Verification of Quantum Circuits to Verification of Quantum Programs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yu-Fang Chen</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+K">Kai-Min Chung</a>, <a href="/search/cs?searchtype=author&query=Hsieh%2C+M">Min-Hsiu Hsieh</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wei-Jia Huang</a>, <a href="/search/cs?searchtype=author&query=Leng%C3%A1l%2C+O">Ond艡ej Leng谩l</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jyun-Ao Lin</a>, <a href="/search/cs?searchtype=author&query=Tsai%2C+W">Wei-Lun Tsai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09121v1-abstract-short" style="display: inline;"> We present a verifier of quantum programs called AutoQ 2.0. Quantum programs extend quantum circuits (the domain of AutoQ 1.0) by classical control flow constructs, which enable users to describe advanced quantum algorithms in a formal and precise manner. The extension is highly non-trivial, as we needed to tackle both theoretical challenges (such as the treatment of measurement, the normalization… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09121v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09121v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09121v1-abstract-full" style="display: none;"> We present a verifier of quantum programs called AutoQ 2.0. Quantum programs extend quantum circuits (the domain of AutoQ 1.0) by classical control flow constructs, which enable users to describe advanced quantum algorithms in a formal and precise manner. The extension is highly non-trivial, as we needed to tackle both theoretical challenges (such as the treatment of measurement, the normalization problem, and lifting techniques for verification of classical programs with loops to the quantum world), and engineering issues (such as extending the input format with a~support for specifying loop invariants). We have successfully used AutoQ 2.0 to verify two types of advanced quantum programs that cannot be expressed using only quantum circuits: the \emph{repeat-until-success} (RUS) algorithm and the weak-measurement-based version of Grover's search algorithm. AutoQ 2.0 can efficiently verify all our benchmarks: all RUS algorithms were verified instantly and, for the weak-measurement-based version of Grover's search, we were able to handle the case of 100 qubits in $\sim$20 minutes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09121v1-abstract-full').style.display = 'none'; document.getElementById('2411.09121v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">regular tool paper submitted to TACAS 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09116">arXiv:2411.09116</a> <span> [<a href="https://arxiv.org/pdf/2411.09116">pdf</a>, <a href="https://arxiv.org/format/2411.09116">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> P-MMEval: A Parallel Multilingual Multitask Benchmark for Consistent Evaluation of LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yidan Zhang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+B">Boyi Deng</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Y">Yu Wan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B">Baosong Yang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+H">Haoran Wei</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Fei Huang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09116v1-abstract-short" style="display: inline;"> Recent advancements in large language models (LLMs) showcase varied multilingual capabilities across tasks like translation, code generation, and reasoning. Previous assessments often limited their scope to fundamental natural language processing (NLP) or isolated capability-specific tasks. To alleviate this drawback, we aim to present a comprehensive multilingual multitask benchmark. First, we pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09116v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09116v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09116v1-abstract-full" style="display: none;"> Recent advancements in large language models (LLMs) showcase varied multilingual capabilities across tasks like translation, code generation, and reasoning. Previous assessments often limited their scope to fundamental natural language processing (NLP) or isolated capability-specific tasks. To alleviate this drawback, we aim to present a comprehensive multilingual multitask benchmark. First, we present a pipeline for selecting available and reasonable benchmarks from massive ones, addressing the oversight in previous work regarding the utility of these benchmarks, i.e., their ability to differentiate between models being evaluated. Leveraging this pipeline, we introduce P-MMEval, a large-scale benchmark covering effective fundamental and capability-specialized datasets. Furthermore, P-MMEval delivers consistent language coverage across various datasets and provides parallel samples. Finally, we conduct extensive experiments on representative multilingual model series to compare performances across models, analyze dataset effectiveness, examine prompt impacts on model performances, and explore the relationship between multilingual performances and factors such as tasks, model sizes, and languages. These insights offer valuable guidance for future research. The dataset is available at https://huggingface.co/datasets/Qwen/P-MMEval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09116v1-abstract-full').style.display = 'none'; document.getElementById('2411.09116v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08275">arXiv:2411.08275</a> <span> [<a href="https://arxiv.org/pdf/2411.08275">pdf</a>, <a href="https://arxiv.org/format/2411.08275">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Large-Scale Study of Relevance Assessments with Large Language Models: An Initial Look </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Upadhyay%2C+S">Shivani Upadhyay</a>, <a href="/search/cs?searchtype=author&query=Pradeep%2C+R">Ronak Pradeep</a>, <a href="/search/cs?searchtype=author&query=Thakur%2C+N">Nandan Thakur</a>, <a href="/search/cs?searchtype=author&query=Campos%2C+D">Daniel Campos</a>, <a href="/search/cs?searchtype=author&query=Craswell%2C+N">Nick Craswell</a>, <a href="/search/cs?searchtype=author&query=Soboroff%2C+I">Ian Soboroff</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+H+T">Hoa Trang Dang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jimmy Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08275v1-abstract-short" style="display: inline;"> The application of large language models to provide relevance assessments presents exciting opportunities to advance information retrieval, natural language processing, and beyond, but to date many unknowns remain. This paper reports on the results of a large-scale evaluation (the TREC 2024 RAG Track) where four different relevance assessment approaches were deployed in situ: the "standard" fully… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08275v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08275v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08275v1-abstract-full" style="display: none;"> The application of large language models to provide relevance assessments presents exciting opportunities to advance information retrieval, natural language processing, and beyond, but to date many unknowns remain. This paper reports on the results of a large-scale evaluation (the TREC 2024 RAG Track) where four different relevance assessment approaches were deployed in situ: the "standard" fully manual process that NIST has implemented for decades and three different alternatives that take advantage of LLMs to different extents using the open-source UMBRELA tool. This setup allows us to correlate system rankings induced by the different approaches to characterize tradeoffs between cost and quality. We find that in terms of nDCG@20, nDCG@100, and Recall@100, system rankings induced by automatically generated relevance assessments from UMBRELA correlate highly with those induced by fully manual assessments across a diverse set of 77 runs from 19 teams. Our results suggest that automatically generated UMBRELA judgments can replace fully manual judgments to accurately capture run-level effectiveness. Surprisingly, we find that LLM assistance does not appear to increase correlation with fully manual assessments, suggesting that costs associated with human-in-the-loop processes do not bring obvious tangible benefits. Overall, human assessors appear to be stricter than UMBRELA in applying relevance criteria. Our work validates the use of LLMs in academic TREC-style evaluations and provides the foundation for future studies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08275v1-abstract-full').style.display = 'none'; document.getElementById('2411.08275v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07183">arXiv:2411.07183</a> <span> [<a href="https://arxiv.org/pdf/2411.07183">pdf</a>, <a href="https://arxiv.org/format/2411.07183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Probabilistic approach to feedback control enhances multi-legged locomotion on rugged landscapes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+J">Juntao He</a>, <a href="/search/cs?searchtype=author&query=Chong%2C+B">Baxi Chong</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jianfeng Lin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Z">Zhaochen Xu</a>, <a href="/search/cs?searchtype=author&query=Bagheri%2C+H">Hosain Bagheri</a>, <a href="/search/cs?searchtype=author&query=Flores%2C+E">Esteban Flores</a>, <a href="/search/cs?searchtype=author&query=Goldman%2C+D+I">Daniel I. Goldman</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07183v1-abstract-short" style="display: inline;"> Achieving robust legged locomotion on complex terrains poses challenges due to the high uncertainty in robot-environment interactions. Recent advances in bipedal and quadrupedal robots demonstrate good mobility on rugged terrains but rely heavily on sensors for stability due to low static stability from a high center of mass and a narrow base of support. We hypothesize that a multi-legged robotic… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07183v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07183v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07183v1-abstract-full" style="display: none;"> Achieving robust legged locomotion on complex terrains poses challenges due to the high uncertainty in robot-environment interactions. Recent advances in bipedal and quadrupedal robots demonstrate good mobility on rugged terrains but rely heavily on sensors for stability due to low static stability from a high center of mass and a narrow base of support. We hypothesize that a multi-legged robotic system can leverage morphological redundancy from additional legs to minimize sensing requirements when traversing challenging terrains. Studies suggest that a multi-legged system with sufficient legs can reliably navigate noisy landscapes without sensing and control, albeit at a low speed of up to 0.1 body lengths per cycle (BLC). However, the control framework to enhance speed on challenging terrains remains underexplored due to the complex environmental interactions, making it difficult to identify the key parameters to control in these high-degree-of-freedom systems. Here, we present a bio-inspired vertical body undulation wave as a novel approach to mitigate environmental disturbances affecting robot speed, supported by experiments and probabilistic models. Finally, we introduce a control framework which monitors foot-ground contact patterns on rugose landscapes using binary foot-ground contact sensors to estimate terrain rugosity. The controller adjusts the vertical body wave based on the deviation of the limb's averaged actual-to-ideal foot-ground contact ratio, achieving a significant enhancement of up to 0.235 BLC on rugose laboratory terrain. We observed a $\sim$ 50\% increase in speed and a $\sim$ 40\% reduction in speed variance compared to the open-loop controller. Additionally, the controller operates in complex terrains outside the lab, including pine straw, robot-sized rocks, mud, and leaves. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07183v1-abstract-full').style.display = 'none'; document.getElementById('2411.07183v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE Transactions on Robotics (T-RO)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06666">arXiv:2411.06666</a> <span> [<a href="https://arxiv.org/pdf/2411.06666">pdf</a>, <a href="https://arxiv.org/format/2411.06666">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Adversarial Detection with a Dynamically Stable System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Long%2C+X">Xiaowei Long</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jie Lin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiangyuan Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06666v1-abstract-short" style="display: inline;"> Adversarial detection is designed to identify and reject maliciously crafted adversarial examples(AEs) which are generated to disrupt the classification of target models. Presently, various input transformation-based methods have been developed on adversarial example detection, which typically rely on empirical experience and lead to unreliability against new attacks. To address this issue, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06666v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06666v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06666v1-abstract-full" style="display: none;"> Adversarial detection is designed to identify and reject maliciously crafted adversarial examples(AEs) which are generated to disrupt the classification of target models. Presently, various input transformation-based methods have been developed on adversarial example detection, which typically rely on empirical experience and lead to unreliability against new attacks. To address this issue, we propose and conduct a Dynamically Stable System (DSS), which can effectively detect the adversarial examples from normal examples according to the stability of input examples. Particularly, in our paper, the generation of adversarial examples is considered as the perturbation process of a Lyapunov dynamic system, and we propose an example stability mechanism, in which a novel control term is added in adversarial example generation to ensure that the normal examples can achieve dynamic stability while the adversarial examples cannot achieve the stability. Then, based on the proposed example stability mechanism, a Dynamically Stable System (DSS) is proposed, which can utilize the disruption and restoration actions to determine the stability of input examples and detect the adversarial examples through changes in the stability of the input examples. In comparison with existing methods in three benchmark datasets(MNIST, CIFAR10, and CIFAR100), our evaluation results show that our proposed DSS can achieve ROC-AUC values of 99.83%, 97.81% and 94.47%, surpassing the state-of-the-art(SOTA) values of 97.35%, 91.10% and 93.49% in the other 7 methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06666v1-abstract-full').style.display = 'none'; document.getElementById('2411.06666v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05825">arXiv:2411.05825</a> <span> [<a href="https://arxiv.org/pdf/2411.05825">pdf</a>, <a href="https://arxiv.org/format/2411.05825">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SurfGNN: A robust surface-based prediction model with interpretability for coactivation maps of spatial and cortical features </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoshuo Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiong Zhang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Y">Youbing Zeng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiaying Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dan Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianjia Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+D">Duan Xu</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+H">Hosung Kim</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bingguang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mengting Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05825v1-abstract-short" style="display: inline;"> Current brain surface-based prediction models often overlook the variability of regional attributes at the cortical feature level. While graph neural networks (GNNs) excel at capturing regional differences, they encounter challenges when dealing with complex, high-density graph structures. In this work, we consider the cortical surface mesh as a sparse graph and propose an interpretable prediction… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05825v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05825v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05825v1-abstract-full" style="display: none;"> Current brain surface-based prediction models often overlook the variability of regional attributes at the cortical feature level. While graph neural networks (GNNs) excel at capturing regional differences, they encounter challenges when dealing with complex, high-density graph structures. In this work, we consider the cortical surface mesh as a sparse graph and propose an interpretable prediction model-Surface Graph Neural Network (SurfGNN). SurfGNN employs topology-sampling learning (TSL) and region-specific learning (RSL) structures to manage individual cortical features at both lower and higher scales of the surface mesh, effectively tackling the challenges posed by the overly abundant mesh nodes and addressing the issue of heterogeneity in cortical regions. Building on this, a novel score-weighted fusion (SWF) method is implemented to merge nodal representations associated with each cortical feature for prediction. We apply our model to a neonatal brain age prediction task using a dataset of harmonized MR images from 481 subjects (503 scans). SurfGNN outperforms all existing state-of-the-art methods, demonstrating an improvement of at least 9.0% and achieving a mean absolute error (MAE) of 0.827+0.056 in postmenstrual weeks. Furthermore, it generates feature-level activation maps, indicating its capability to identify robust regional variations in different morphometric contributions for prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05825v1-abstract-full').style.display = 'none'; document.getElementById('2411.05825v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05508">arXiv:2411.05508</a> <span> [<a href="https://arxiv.org/pdf/2411.05508">pdf</a>, <a href="https://arxiv.org/format/2411.05508">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> An Early FIRST Reproduction and Improvements to Single-Token Decoding for Fast Listwise Reranking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zijian Chen</a>, <a href="/search/cs?searchtype=author&query=Pradeep%2C+R">Ronak Pradeep</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jimmy Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05508v2-abstract-short" style="display: inline;"> Recent advances have demonstrated that large language models (LLMs) excel as listwise rerankers, but their high computational demands remain a barrier to widespread adoption. Further, the traditional language modeling (LM) objective is not ideally suited for reranking tasks. FIRST is a novel approach that addresses these challenges by integrating a learning-to-rank objective and leveraging the log… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05508v2-abstract-full').style.display = 'inline'; document.getElementById('2411.05508v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05508v2-abstract-full" style="display: none;"> Recent advances have demonstrated that large language models (LLMs) excel as listwise rerankers, but their high computational demands remain a barrier to widespread adoption. Further, the traditional language modeling (LM) objective is not ideally suited for reranking tasks. FIRST is a novel approach that addresses these challenges by integrating a learning-to-rank objective and leveraging the logits of only the first generated token, thereby significantly reducing inference latency compared to traditional LLM rerankers. In this study, we extend the evaluation of FIRST to the TREC Deep Learning datasets (DL19-22), validating its robustness across diverse domains. We investigate the influence of different first-stage retrievers on FIRST rerankers, observing diminishing returns and patterns consistent with traditional LLM rerankers. Through applying the FIRST objective to a broader range of backbone models, we achieve effectiveness surpassing the original implementation. Our experiments confirm that fast reranking with single-token logits does not compromise out-of-domain reranking quality. To better quantify the computational savings in the original study, we measure and compare latency to find a 21%-42% gain across various models and benchmarks. Moreover, while LM training implicitly improves zero-shot single-token reranking, our experiments also raise questions about whether LM pre-training may hinder subsequent fine-tuning with the FIRST objective. These findings pave the way for more efficient and effective listwise reranking in future applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05508v2-abstract-full').style.display = 'none'; document.getElementById('2411.05508v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05361">arXiv:2411.05361</a> <span> [<a href="https://arxiv.org/pdf/2411.05361">pdf</a>, <a href="https://arxiv.org/format/2411.05361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dynamic-SUPERB Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chien-yu Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A+T">Andy T. Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen-An Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+W">Wei-Cheng Tseng</a>, <a href="/search/cs?searchtype=author&query=Diwan%2C+A">Anuj Diwan</a>, <a href="/search/cs?searchtype=author&query=Shih%2C+Y">Yi-Jen Shih</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/cs?searchtype=author&query=Hsiao%2C+C">Chi-Yuan Hsiao</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+P">Puyuan Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shih-Heng Wang</a>, <a href="/search/cs?searchtype=author&query=Kuan%2C+C">Chun-Yi Kuan</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/cs?searchtype=author&query=Ritter-Gutierrez%2C+F">Fabian Ritter-Gutierrez</a>, <a href="/search/cs?searchtype=author&query=Chuang%2C+M+T">Ming To Chuang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kuan-Po Huang</a>, <a href="/search/cs?searchtype=author&query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">You-Kuan Lin</a>, <a href="/search/cs?searchtype=author&query=Yeo%2C+E">Eunjung Yeo</a> , et al. (53 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05361v1-abstract-short" style="display: inline;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05361v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05361v1-abstract-full" style="display: none;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluation benchmark poses a significant challenge. We present Dynamic-SUPERB Phase-2, an open and evolving benchmark for the comprehensive evaluation of instruction-based universal speech models. Building upon the first generation, this second version incorporates 125 new tasks contributed collaboratively by the global research community, expanding the benchmark to a total of 180 tasks, making it the largest benchmark for speech and audio evaluation. While the first generation of Dynamic-SUPERB was limited to classification tasks, Dynamic-SUPERB Phase-2 broadens its evaluation capabilities by introducing a wide array of novel and diverse tasks, including regression and sequence generation, across speech, music, and environmental audio. Evaluation results indicate that none of the models performed well universally. SALMONN-13B excelled in English ASR, while WavLLM demonstrated high accuracy in emotion recognition, but current models still require further innovations to handle a broader range of tasks. We will soon open-source all task data and the evaluation pipeline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'none'; document.getElementById('2411.05361v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05194">arXiv:2411.05194</a> <span> [<a href="https://arxiv.org/pdf/2411.05194">pdf</a>, <a href="https://arxiv.org/format/2411.05194">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Interactive Dialogue Agents via Reinforcement Learning on Hindsight Regenerations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hong%2C+J">Joey Hong</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jessica Lin</a>, <a href="/search/cs?searchtype=author&query=Dragan%2C+A">Anca Dragan</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+S">Sergey Levine</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05194v1-abstract-short" style="display: inline;"> Recent progress on large language models (LLMs) has enabled dialogue agents to generate highly naturalistic and plausible text. However, current LLM language generation focuses on responding accurately to questions and requests with a single effective response. In reality, many real dialogues are interactive, meaning an agent's utterances will influence their conversational partner, elicit informa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05194v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05194v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05194v1-abstract-full" style="display: none;"> Recent progress on large language models (LLMs) has enabled dialogue agents to generate highly naturalistic and plausible text. However, current LLM language generation focuses on responding accurately to questions and requests with a single effective response. In reality, many real dialogues are interactive, meaning an agent's utterances will influence their conversational partner, elicit information, or change their opinion. Accounting for how an agent can effectively steer a conversation is a crucial ability in many dialogue tasks, from healthcare to preference elicitation. Existing methods for fine-tuning dialogue agents to accomplish such tasks would rely on curating some amount of expert data. However, doing so often requires understanding the underlying cognitive processes of the conversational partner, which is a skill neither humans nor LLMs trained on human data can reliably do. Our key insight is that while LLMs may not be adept at identifying effective strategies for steering conversations a priori, or in the middle of an ongoing conversation, they can do so post-hoc, or in hindsight, after seeing how their conversational partner responds. We use this fact to rewrite and augment existing suboptimal data, and train via offline reinforcement learning (RL) an agent that outperforms both prompting and learning from unaltered human demonstrations. We apply our approach to two domains that require understanding human mental state, intelligent interaction, and persuasion: mental health support, and soliciting charitable donations. Our results in a user study with real humans show that our approach greatly outperforms existing state-of-the-art dialogue agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05194v1-abstract-full').style.display = 'none'; document.getElementById('2411.05194v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04530">arXiv:2411.04530</a> <span> [<a href="https://arxiv.org/pdf/2411.04530">pdf</a>, <a href="https://arxiv.org/format/2411.04530">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Tomato, Tomahto, Tomate: Measuring the Role of Shared Semantics among Subwords in Multilingual Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jing Lu</a>, <a href="/search/cs?searchtype=author&query=Tran%2C+V+Q">Vinh Q. Tran</a>, <a href="/search/cs?searchtype=author&query=Schuster%2C+T">Tal Schuster</a>, <a href="/search/cs?searchtype=author&query=Metzler%2C+D">Donald Metzler</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jimmy Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04530v1-abstract-short" style="display: inline;"> Human understanding of language is robust to different word choices as far as they represent similar semantic concepts. To what extent does our human intuition transfer to language models, which represent all subwords as distinct embeddings? In this work, we take an initial step on measuring the role of shared semantics among subwords in the encoder-only multilingual language models (mLMs). To thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04530v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04530v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04530v1-abstract-full" style="display: none;"> Human understanding of language is robust to different word choices as far as they represent similar semantic concepts. To what extent does our human intuition transfer to language models, which represent all subwords as distinct embeddings? In this work, we take an initial step on measuring the role of shared semantics among subwords in the encoder-only multilingual language models (mLMs). To this end, we form "semantic tokens" by merging the semantically similar subwords and their embeddings, and evaluate the updated mLMs on 5 heterogeneous multilingual downstream tasks. Results show that the general shared semantics could get the models a long way in making the predictions on mLMs with different tokenizers and model sizes. Inspections on the grouped subwords show that they exhibit a wide range of semantic similarities, including synonyms and translations across many languages and scripts. Lastly, we found the zero-shot results with semantic tokens are on par or even better than the original models on certain classification tasks, suggesting that the shared subword-level semantics may serve as the anchors for cross-lingual transferring. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04530v1-abstract-full').style.display = 'none'; document.getElementById('2411.04530v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03628">arXiv:2411.03628</a> <span> [<a href="https://arxiv.org/pdf/2411.03628">pdf</a>, <a href="https://arxiv.org/format/2411.03628">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> StreamingBench: Assessing the Gap for MLLMs to Achieve Streaming Video Understanding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junming Lin</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Z">Zheng Fang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chi Chen</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Z">Zihao Wan</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+F">Fuwen Luo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Peng Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03628v1-abstract-short" style="display: inline;"> The rapid development of Multimodal Large Language Models (MLLMs) has expanded their capabilities from image comprehension to video understanding. However, most of these MLLMs focus primarily on offline video comprehension, necessitating extensive processing of all video frames before any queries can be made. This presents a significant gap compared to the human ability to watch, listen, think, an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03628v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03628v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03628v1-abstract-full" style="display: none;"> The rapid development of Multimodal Large Language Models (MLLMs) has expanded their capabilities from image comprehension to video understanding. However, most of these MLLMs focus primarily on offline video comprehension, necessitating extensive processing of all video frames before any queries can be made. This presents a significant gap compared to the human ability to watch, listen, think, and respond to streaming inputs in real time, highlighting the limitations of current MLLMs. In this paper, we introduce StreamingBench, the first comprehensive benchmark designed to evaluate the streaming video understanding capabilities of MLLMs. StreamingBench assesses three core aspects of streaming video understanding: (1) real-time visual understanding, (2) omni-source understanding, and (3) contextual understanding. The benchmark consists of 18 tasks, featuring 900 videos and 4,500 human-curated QA pairs. Each video features five questions presented at different time points to simulate a continuous streaming scenario. We conduct experiments on StreamingBench with 13 open-source and proprietary MLLMs and find that even the most advanced proprietary MLLMs like Gemini 1.5 Pro and GPT-4o perform significantly below human-level streaming video understanding capabilities. We hope our work can facilitate further advancements for MLLMs, empowering them to approach human-level video comprehension and interaction in more realistic scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03628v1-abstract-full').style.display = 'none'; document.getElementById('2411.03628v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03291">arXiv:2411.03291</a> <span> [<a href="https://arxiv.org/pdf/2411.03291">pdf</a>, <a href="https://arxiv.org/format/2411.03291">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Using Assurance Cases to Guide Verification and Validation of Research Software </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Smith%2C+W+S">W. Spencer Smith</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jingyi Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03291v1-abstract-short" style="display: inline;"> Research software engineers can use Assurance Cases (ACs) to guide Verification and Validation (VnV) efforts. An AC is a structured argument that a property like correctness holds. We illustrate how ACs can guide VnV activities via a case study of software for automatically extracting the 3D segmentation of the aorta from medical images of the chest. The AC argument suggests that the following evi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03291v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03291v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03291v1-abstract-full" style="display: none;"> Research software engineers can use Assurance Cases (ACs) to guide Verification and Validation (VnV) efforts. An AC is a structured argument that a property like correctness holds. We illustrate how ACs can guide VnV activities via a case study of software for automatically extracting the 3D segmentation of the aorta from medical images of the chest. The AC argument suggests that the following evidence is required: comparison to a pseudo-oracle; traceability between requirements, design, code and tests; review of all artifacts by a domain expert with proper credentials; documentation of input assumptions; and a warning that only qualified people should use the software. The case study highlights that code is not the only artifact of interest for building confidence and that making an explicit distinction between software and user responsibilities is useful. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03291v1-abstract-full').style.display = 'none'; document.getElementById('2411.03291v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">7 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02571">arXiv:2411.02571</a> <span> [<a href="https://arxiv.org/pdf/2411.02571">pdf</a>, <a href="https://arxiv.org/format/2411.02571">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> MM-Embed: Universal Multimodal Retrieval with Multimodal LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+S">Sheng-Chieh Lin</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+C">Chankyu Lee</a>, <a href="/search/cs?searchtype=author&query=Shoeybi%2C+M">Mohammad Shoeybi</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jimmy Lin</a>, <a href="/search/cs?searchtype=author&query=Catanzaro%2C+B">Bryan Catanzaro</a>, <a href="/search/cs?searchtype=author&query=Ping%2C+W">Wei Ping</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02571v1-abstract-short" style="display: inline;"> State-of-the-art retrieval models typically address a straightforward search scenario, where retrieval tasks are fixed (e.g., finding a passage to answer a specific question) and only a single modality is supported for both queries and retrieved results. This paper introduces techniques for advancing information retrieval with multimodal large language models (MLLMs), enabling a broader search sce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02571v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02571v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02571v1-abstract-full" style="display: none;"> State-of-the-art retrieval models typically address a straightforward search scenario, where retrieval tasks are fixed (e.g., finding a passage to answer a specific question) and only a single modality is supported for both queries and retrieved results. This paper introduces techniques for advancing information retrieval with multimodal large language models (MLLMs), enabling a broader search scenario, termed universal multimodal retrieval, where multiple modalities and diverse retrieval tasks are accommodated. To this end, we first study fine-tuning an MLLM as a bi-encoder retriever on 10 datasets with 16 retrieval tasks. Our empirical results show that the fine-tuned MLLM retriever is capable of understanding challenging queries, composed of both text and image, but underperforms a smaller CLIP retriever in cross-modal retrieval tasks due to modality bias from MLLMs. To address the issue, we propose modality-aware hard negative mining to mitigate the modality bias exhibited by MLLM retrievers. Second, we propose to continually fine-tune the universal multimodal retriever to enhance its text retrieval capability while maintaining multimodal retrieval capability. As a result, our model, MM-Embed, achieves state-of-the-art performance on the multimodal retrieval benchmark M-BEIR, which spans multiple domains and tasks, while also surpassing the state-of-the-art text retrieval model, NV-Embed-v1, on MTEB retrieval benchmark. Finally, we explore to prompt the off-the-shelf MLLMs as the zero-shot rerankers to refine the ranking of the candidates from the multimodal retriever. We find that through prompt-and-reranking, MLLMs can further improve multimodal retrieval when the user queries (e.g., text-image composed queries) are more complex and challenging to understand. These findings also pave the way to advance universal multimodal retrieval in the future. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02571v1-abstract-full').style.display = 'none'; document.getElementById('2411.02571v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">We release the model weights at: https://huggingface.co/nvidia/MM-Embed</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02435">arXiv:2411.02435</a> <span> [<a href="https://arxiv.org/pdf/2411.02435">pdf</a>, <a href="https://arxiv.org/format/2411.02435">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Narrative Analysis of True Crime Podcasts With Knowledge Graph-Augmented Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Leng%2C+X">Xinyi Leng</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jason Liang</a>, <a href="/search/cs?searchtype=author&query=Mauro%2C+J">Jack Mauro</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xu Wang</a>, <a href="/search/cs?searchtype=author&query=Bertozzi%2C+A+L">Andrea L. Bertozzi</a>, <a href="/search/cs?searchtype=author&query=Chapman%2C+J">James Chapman</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyuan Lin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bohan Chen</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+C">Chenchen Ye</a>, <a href="/search/cs?searchtype=author&query=Daniel%2C+T">Temple Daniel</a>, <a href="/search/cs?searchtype=author&query=Brantingham%2C+P+J">P. Jeffrey Brantingham</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02435v1-abstract-short" style="display: inline;"> Narrative data spans all disciplines and provides a coherent model of the world to the reader or viewer. Recent advancement in machine learning and Large Language Models (LLMs) have enable great strides in analyzing natural language. However, Large language models (LLMs) still struggle with complex narrative arcs as well as narratives containing conflicting information. Recent work indicates LLMs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02435v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02435v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02435v1-abstract-full" style="display: none;"> Narrative data spans all disciplines and provides a coherent model of the world to the reader or viewer. Recent advancement in machine learning and Large Language Models (LLMs) have enable great strides in analyzing natural language. However, Large language models (LLMs) still struggle with complex narrative arcs as well as narratives containing conflicting information. Recent work indicates LLMs augmented with external knowledge bases can improve the accuracy and interpretability of the resulting models. In this work, we analyze the effectiveness of applying knowledge graphs (KGs) in understanding true-crime podcast data from both classical Natural Language Processing (NLP) and LLM approaches. We directly compare KG-augmented LLMs (KGLLMs) with classical methods for KG construction, topic modeling, and sentiment analysis. Additionally, the KGLLM allows us to query the knowledge base in natural language and test its ability to factually answer questions. We examine the robustness of the model to adversarial prompting in order to test the model's ability to deal with conflicting information. Finally, we apply classical methods to understand more subtle aspects of the text such as the use of hearsay and sentiment in narrative construction and propose future directions. Our results indicate that KGLLMs outperform LLMs on a variety of metrics, are more robust to adversarial prompts, and are more capable of summarizing the text into topics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02435v1-abstract-full').style.display = 'none'; document.getElementById('2411.02435v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 Pages, 3 Figures, GTA3 Workshop-2024, October 2024, 33rd International Conference on Information and Knowledge Management, Boise, Idaho, USA</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00491">arXiv:2411.00491</a> <span> [<a href="https://arxiv.org/pdf/2411.00491">pdf</a>, <a href="https://arxiv.org/format/2411.00491">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> GDTB: Genre Diverse Data for English Shallow Discourse Parsing across Modalities, Text Types, and Domains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y+J">Yang Janet Liu</a>, <a href="/search/cs?searchtype=author&query=Aoyama%2C+T">Tatsuya Aoyama</a>, <a href="/search/cs?searchtype=author&query=Scivetti%2C+W">Wesley Scivetti</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yilun Zhu</a>, <a href="/search/cs?searchtype=author&query=Behzad%2C+S">Shabnam Behzad</a>, <a href="/search/cs?searchtype=author&query=Levine%2C+L+E">Lauren Elizabeth Levine</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jessica Lin</a>, <a href="/search/cs?searchtype=author&query=Tiwari%2C+D">Devika Tiwari</a>, <a href="/search/cs?searchtype=author&query=Zeldes%2C+A">Amir Zeldes</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00491v1-abstract-short" style="display: inline;"> Work on shallow discourse parsing in English has focused on the Wall Street Journal corpus, the only large-scale dataset for the language in the PDTB framework. However, the data is not openly available, is restricted to the news domain, and is by now 35 years old. In this paper, we present and evaluate a new open-access, multi-genre benchmark for PDTB-style shallow discourse parsing, based on the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00491v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00491v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00491v1-abstract-full" style="display: none;"> Work on shallow discourse parsing in English has focused on the Wall Street Journal corpus, the only large-scale dataset for the language in the PDTB framework. However, the data is not openly available, is restricted to the news domain, and is by now 35 years old. In this paper, we present and evaluate a new open-access, multi-genre benchmark for PDTB-style shallow discourse parsing, based on the existing UD English GUM corpus, for which discourse relation annotations in other frameworks already exist. In a series of experiments on cross-domain relation classification, we show that while our dataset is compatible with PDTB, substantial out-of-domain degradation is observed, which can be alleviated by joint training on both datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00491v1-abstract-full').style.display = 'none'; document.getElementById('2411.00491v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to EMNLP 2024 (main, long); camera-ready version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23933">arXiv:2410.23933</a> <span> [<a href="https://arxiv.org/pdf/2410.23933">pdf</a>, <a href="https://arxiv.org/format/2410.23933">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Language Models can Self-Lengthen to Generate Long Texts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Quan%2C+S">Shanghaoran Quan</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+T">Tianyi Tang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+A">An Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dayiheng Liu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+B">Bofei Gao</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+J">Jianhong Tu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23933v1-abstract-short" style="display: inline;"> Recent advancements in Large Language Models (LLMs) have significantly enhanced their ability to process long contexts, yet a notable gap remains in generating long, aligned outputs. This limitation stems from a training gap where pre-training lacks effective instructions for long-text generation, and post-training data primarily consists of short query-response pairs. Current approaches, such as… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23933v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23933v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23933v1-abstract-full" style="display: none;"> Recent advancements in Large Language Models (LLMs) have significantly enhanced their ability to process long contexts, yet a notable gap remains in generating long, aligned outputs. This limitation stems from a training gap where pre-training lacks effective instructions for long-text generation, and post-training data primarily consists of short query-response pairs. Current approaches, such as instruction backtranslation and behavior imitation, face challenges including data quality, copyright issues, and constraints on proprietary model usage. In this paper, we introduce an innovative iterative training framework called Self-Lengthen that leverages only the intrinsic knowledge and skills of LLMs without the need for auxiliary data or proprietary models. The framework consists of two roles: the Generator and the Extender. The Generator produces the initial response, which is then split and expanded by the Extender. This process results in a new, longer response, which is used to train both the Generator and the Extender iteratively. Through this process, the models are progressively trained to handle increasingly longer responses. Experiments on benchmarks and human evaluations show that Self-Lengthen outperforms existing methods in long-text generation, when applied to top open-source LLMs such as Qwen2 and LLaMA3. Our code is publicly available at https://github.com/QwenLM/Self-Lengthen. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23933v1-abstract-full').style.display = 'none'; document.getElementById('2410.23933v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22364">arXiv:2410.22364</a> <span> [<a href="https://arxiv.org/pdf/2410.22364">pdf</a>, <a href="https://arxiv.org/format/2410.22364">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Accelerating Augmentation Invariance Pretraining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jinhong Lin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+C">Cheng-En Wu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yibing Wei</a>, <a href="/search/cs?searchtype=author&query=Morgado%2C+P">Pedro Morgado</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22364v2-abstract-short" style="display: inline;"> Our work tackles the computational challenges of contrastive learning methods, particularly for the pretraining of Vision Transformers (ViTs). Despite the effectiveness of contrastive learning, the substantial computational resources required for training often hinder their practical application. To mitigate this issue, we propose an acceleration framework, leveraging ViT's unique ability to gener… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22364v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22364v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22364v2-abstract-full" style="display: none;"> Our work tackles the computational challenges of contrastive learning methods, particularly for the pretraining of Vision Transformers (ViTs). Despite the effectiveness of contrastive learning, the substantial computational resources required for training often hinder their practical application. To mitigate this issue, we propose an acceleration framework, leveraging ViT's unique ability to generalize across inputs of varying sequence lengths. Our method employs a mix of sequence compression strategies, including randomized token dropout and flexible patch scaling, to reduce the cost of gradient estimation and accelerate convergence. We further provide an in-depth analysis of the gradient estimation error of various acceleration strategies as well as their impact on downstream tasks, offering valuable insights into the trade-offs between acceleration and performance. We also propose a novel procedure to identify an optimal acceleration schedule to adjust the sequence compression ratios to the training progress, ensuring efficient training without sacrificing downstream performance. Our approach significantly reduces computational overhead across various self-supervised learning algorithms on large-scale datasets. In ImageNet, our method achieves speedups of 4$\times$ in MoCo, 3.3$\times$ in SimCLR, and 2.5$\times$ in DINO, demonstrating substantial efficiency gains. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22364v2-abstract-full').style.display = 'none'; document.getElementById('2410.22364v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.21276">arXiv:2410.21276</a> <span> [<a href="https://arxiv.org/pdf/2410.21276">pdf</a>, <a href="https://arxiv.org/format/2410.21276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> GPT-4o System Card </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=OpenAI"> OpenAI</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Hurst%2C+A">Aaron Hurst</a>, <a href="/search/cs?searchtype=author&query=Lerer%2C+A">Adam Lerer</a>, <a href="/search/cs?searchtype=author&query=Goucher%2C+A+P">Adam P. Goucher</a>, <a href="/search/cs?searchtype=author&query=Perelman%2C+A">Adam Perelman</a>, <a href="/search/cs?searchtype=author&query=Ramesh%2C+A">Aditya Ramesh</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+A">Aidan Clark</a>, <a href="/search/cs?searchtype=author&query=Ostrow%2C+A">AJ Ostrow</a>, <a href="/search/cs?searchtype=author&query=Welihinda%2C+A">Akila Welihinda</a>, <a href="/search/cs?searchtype=author&query=Hayes%2C+A">Alan Hayes</a>, <a href="/search/cs?searchtype=author&query=Radford%2C+A">Alec Radford</a>, <a href="/search/cs?searchtype=author&query=M%C4%85dry%2C+A">Aleksander M膮dry</a>, <a href="/search/cs?searchtype=author&query=Baker-Whitcomb%2C+A">Alex Baker-Whitcomb</a>, <a href="/search/cs?searchtype=author&query=Beutel%2C+A">Alex Beutel</a>, <a href="/search/cs?searchtype=author&query=Borzunov%2C+A">Alex Borzunov</a>, <a href="/search/cs?searchtype=author&query=Carney%2C+A">Alex Carney</a>, <a href="/search/cs?searchtype=author&query=Chow%2C+A">Alex Chow</a>, <a href="/search/cs?searchtype=author&query=Kirillov%2C+A">Alex Kirillov</a>, <a href="/search/cs?searchtype=author&query=Nichol%2C+A">Alex Nichol</a>, <a href="/search/cs?searchtype=author&query=Paino%2C+A">Alex Paino</a>, <a href="/search/cs?searchtype=author&query=Renzin%2C+A">Alex Renzin</a>, <a href="/search/cs?searchtype=author&query=Passos%2C+A+T">Alex Tachard Passos</a>, <a href="/search/cs?searchtype=author&query=Kirillov%2C+A">Alexander Kirillov</a>, <a href="/search/cs?searchtype=author&query=Christakis%2C+A">Alexi Christakis</a> , et al. (395 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.21276v1-abstract-short" style="display: inline;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 mil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'inline'; document.getElementById('2410.21276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.21276v1-abstract-full" style="display: none;"> GPT-4o is an autoregressive omni model that accepts as input any combination of text, audio, image, and video, and generates any combination of text, audio, and image outputs. It's trained end-to-end across text, vision, and audio, meaning all inputs and outputs are processed by the same neural network. GPT-4o can respond to audio inputs in as little as 232 milliseconds, with an average of 320 milliseconds, which is similar to human response time in conversation. It matches GPT-4 Turbo performance on text in English and code, with significant improvement on text in non-English languages, while also being much faster and 50\% cheaper in the API. GPT-4o is especially better at vision and audio understanding compared to existing models. In line with our commitment to building AI safely and consistent with our voluntary commitments to the White House, we are sharing the GPT-4o System Card, which includes our Preparedness Framework evaluations. In this System Card, we provide a detailed look at GPT-4o's capabilities, limitations, and safety evaluations across multiple categories, focusing on speech-to-speech while also evaluating text and image capabilities, and measures we've implemented to ensure the model is safe and aligned. We also include third-party assessments on dangerous capabilities, as well as discussion of potential societal impacts of GPT-4o's text and vision capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.21276v1-abstract-full').style.display = 'none'; document.getElementById('2410.21276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20778">arXiv:2410.20778</a> <span> [<a href="https://arxiv.org/pdf/2410.20778">pdf</a>, <a href="https://arxiv.org/format/2410.20778">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Beyond Positive History: Re-ranking with List-level Hybrid Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Weng%2C+M">Muyan Weng</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+Y">Yunjia Xi</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weiwen Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+B">Bo Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jianghao Lin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+R">Ruiming Tang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weinan Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yong Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20778v1-abstract-short" style="display: inline;"> As the last stage of recommender systems, re-ranking generates a re-ordered list that aligns with the user's preference. However, previous works generally focus on item-level positive feedback as history (e.g., only clicked items) and ignore that users provide positive or negative feedback on items in the entire list. This list-level hybrid feedback can reveal users' holistic preferences and refle… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20778v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20778v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20778v1-abstract-full" style="display: none;"> As the last stage of recommender systems, re-ranking generates a re-ordered list that aligns with the user's preference. However, previous works generally focus on item-level positive feedback as history (e.g., only clicked items) and ignore that users provide positive or negative feedback on items in the entire list. This list-level hybrid feedback can reveal users' holistic preferences and reflect users' comparison behavior patterns manifesting within a list. Such patterns could predict user behaviors on candidate lists, thus aiding better re-ranking. Despite appealing benefits, extracting and integrating preferences and behavior patterns from list-level hybrid feedback into re-ranking multiple items remains challenging. To this end, we propose Re-ranking with List-level Hybrid Feedback (dubbed RELIFE). It captures user's preferences and behavior patterns with three modules: a Disentangled Interest Miner to disentangle the user's preferences into interests and disinterests, a Sequential Preference Mixer to learn users' entangled preferences considering the context of feedback, and a Comparison-aware Pattern Extractor to capture user's behavior patterns within each list. Moreover, for better integration of patterns, contrastive learning is adopted to align the behavior patterns of candidate and historical lists. Extensive experiments show that RELIFE significantly outperforms SOTA re-ranking baselines. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20778v1-abstract-full').style.display = 'none'; document.getElementById('2410.20778v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20746">arXiv:2410.20746</a> <span> [<a href="https://arxiv.org/pdf/2410.20746">pdf</a>, <a href="https://arxiv.org/format/2410.20746">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> ElectionSim: Massive Population Election Simulation Powered by Large Language Model Driven Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinnong Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiayu Lin</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Libo Sun</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+W">Weihong Qi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yihang Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yue Chen</a>, <a href="/search/cs?searchtype=author&query=Lyu%2C+H">Hanjia Lyu</a>, <a href="/search/cs?searchtype=author&query=Mou%2C+X">Xinyi Mou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siming Chen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Jiebo Luo</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shiping Tang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhongyu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20746v3-abstract-short" style="display: inline;"> The massive population election simulation aims to model the preferences of specific groups in particular election scenarios. It has garnered significant attention for its potential to forecast real-world social trends. Traditional agent-based modeling (ABM) methods are constrained by their ability to incorporate complex individual background information and provide interactive prediction results.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20746v3-abstract-full').style.display = 'inline'; document.getElementById('2410.20746v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20746v3-abstract-full" style="display: none;"> The massive population election simulation aims to model the preferences of specific groups in particular election scenarios. It has garnered significant attention for its potential to forecast real-world social trends. Traditional agent-based modeling (ABM) methods are constrained by their ability to incorporate complex individual background information and provide interactive prediction results. In this paper, we introduce ElectionSim, an innovative election simulation framework based on large language models, designed to support accurate voter simulations and customized distributions, together with an interactive platform to dialogue with simulated voters. We present a million-level voter pool sampled from social media platforms to support accurate individual simulation. We also introduce PPE, a poll-based presidential election benchmark to assess the performance of our framework under the U.S. presidential election scenario. Through extensive experiments and analyses, we demonstrate the effectiveness and robustness of our framework in U.S. presidential election simulations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20746v3-abstract-full').style.display = 'none'; document.getElementById('2410.20746v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">42 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19845">arXiv:2410.19845</a> <span> [<a href="https://arxiv.org/pdf/2410.19845">pdf</a>, <a href="https://arxiv.org/ps/2410.19845">ps</a>, <a href="https://arxiv.org/format/2410.19845">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Trust and Safety in Digital Payments: An LLM-Powered Approach </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dahiphale%2C+D">Devendra Dahiphale</a>, <a href="/search/cs?searchtype=author&query=Madiraju%2C+N">Naveen Madiraju</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Justin Lin</a>, <a href="/search/cs?searchtype=author&query=Karve%2C+R">Rutvik Karve</a>, <a href="/search/cs?searchtype=author&query=Agrawal%2C+M">Monu Agrawal</a>, <a href="/search/cs?searchtype=author&query=Modwal%2C+A">Anant Modwal</a>, <a href="/search/cs?searchtype=author&query=Balakrishnan%2C+R">Ramanan Balakrishnan</a>, <a href="/search/cs?searchtype=author&query=Shah%2C+S">Shanay Shah</a>, <a href="/search/cs?searchtype=author&query=Kaushal%2C+G">Govind Kaushal</a>, <a href="/search/cs?searchtype=author&query=Mandawat%2C+P">Priya Mandawat</a>, <a href="/search/cs?searchtype=author&query=Hariramani%2C+P">Prakash Hariramani</a>, <a href="/search/cs?searchtype=author&query=Merchant%2C+A">Arif Merchant</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19845v1-abstract-short" style="display: inline;"> Digital payment systems have revolutionized financial transactions, offering unparalleled convenience and accessibility to users worldwide. However, the increasing popularity of these platforms has also attracted malicious actors seeking to exploit their vulnerabilities for financial gain. To address this challenge, robust and adaptable scam detection mechanisms are crucial for maintaining the tru… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19845v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19845v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19845v1-abstract-full" style="display: none;"> Digital payment systems have revolutionized financial transactions, offering unparalleled convenience and accessibility to users worldwide. However, the increasing popularity of these platforms has also attracted malicious actors seeking to exploit their vulnerabilities for financial gain. To address this challenge, robust and adaptable scam detection mechanisms are crucial for maintaining the trust and safety of digital payment ecosystems. This paper presents a comprehensive approach to scam detection, focusing on the Unified Payments Interface (UPI) in India, Google Pay (GPay) as a specific use case. The approach leverages Large Language Models (LLMs) to enhance scam classification accuracy and designs a digital assistant to aid human reviewers in identifying and mitigating fraudulent activities. The results demonstrate the potential of LLMs in augmenting existing machine learning models and improving the efficiency, accuracy, quality, and consistency of scam reviews, ultimately contributing to a safer and more secure digital payment landscape. Our evaluation of the Gemini Ultra model on curated transaction data showed a 93.33% accuracy in scam classification. Furthermore, the model demonstrated 89% accuracy in generating reasoning for these classifications. A promising fact, the model identified 32% new accurate reasons for suspected scams that human reviewers had not included in the review notes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19845v1-abstract-full').style.display = 'none'; document.getElementById('2410.19845v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19346">arXiv:2410.19346</a> <span> [<a href="https://arxiv.org/pdf/2410.19346">pdf</a>, <a href="https://arxiv.org/format/2410.19346">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> AgentSense: Benchmarking Social Intelligence of Language Agents through Interactive Scenarios </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mou%2C+X">Xinyi Mou</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jingcong Liang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiayu Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xinnong Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiawei Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shiyue Yang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+R">Rong Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lei Chen</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+H">Haoyu Kuang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xuanjing Huang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhongyu Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19346v1-abstract-short" style="display: inline;"> Large language models (LLMs) are increasingly leveraged to empower autonomous agents to simulate human beings in various fields of behavioral research. However, evaluating their capacity to navigate complex social interactions remains a challenge. Previous studies face limitations due to insufficient scenario diversity, complexity, and a single-perspective focus. To this end, we introduce AgentSen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19346v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19346v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19346v1-abstract-full" style="display: none;"> Large language models (LLMs) are increasingly leveraged to empower autonomous agents to simulate human beings in various fields of behavioral research. However, evaluating their capacity to navigate complex social interactions remains a challenge. Previous studies face limitations due to insufficient scenario diversity, complexity, and a single-perspective focus. To this end, we introduce AgentSense: Benchmarking Social Intelligence of Language Agents through Interactive Scenarios. Drawing on Dramaturgical Theory, AgentSense employs a bottom-up approach to create 1,225 diverse social scenarios constructed from extensive scripts. We evaluate LLM-driven agents through multi-turn interactions, emphasizing both goal completion and implicit reasoning. We analyze goals using ERG theory and conduct comprehensive experiments. Our findings highlight that LLMs struggle with goals in complex social scenarios, especially high-level growth needs, and even GPT-4o requires improvement in private information reasoning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19346v1-abstract-full').style.display = 'none'; document.getElementById('2410.19346v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19276">arXiv:2410.19276</a> <span> [<a href="https://arxiv.org/pdf/2410.19276">pdf</a>, <a href="https://arxiv.org/format/2410.19276">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Learning ID-free Item Representation with Token Crossing for Multimodal Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kangning Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+J">Jiarui Jin</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Y">Yingjie Qin</a>, <a href="/search/cs?searchtype=author&query=Su%2C+R">Ruilong Su</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jianghao Lin</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yong Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weinan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19276v1-abstract-short" style="display: inline;"> Current multimodal recommendation models have extensively explored the effective utilization of multimodal information; however, their reliance on ID embeddings remains a performance bottleneck. Even with the assistance of multimodal information, optimizing ID embeddings remains challenging for ID-based Multimodal Recommender when interaction data is sparse. Furthermore, the unique nature of item-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19276v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19276v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19276v1-abstract-full" style="display: none;"> Current multimodal recommendation models have extensively explored the effective utilization of multimodal information; however, their reliance on ID embeddings remains a performance bottleneck. Even with the assistance of multimodal information, optimizing ID embeddings remains challenging for ID-based Multimodal Recommender when interaction data is sparse. Furthermore, the unique nature of item-specific ID embeddings hinders the information exchange among related items and the spatial requirement of ID embeddings increases with the scale of item. Based on these limitations, we propose an ID-free MultimOdal TOken Representation scheme named MOTOR that represents each item using learnable multimodal tokens and connects them through shared tokens. Specifically, we first employ product quantization to discretize each item's multimodal features (e.g., images, text) into discrete token IDs. We then interpret the token embeddings corresponding to these token IDs as implicit item features, introducing a new Token Cross Network to capture the implicit interaction patterns among these tokens. The resulting representations can replace the original ID embeddings and transform the original ID-based multimodal recommender into ID-free system, without introducing any additional loss design. MOTOR reduces the overall space requirements of these models, facilitating information interaction among related items, while also significantly enhancing the model's recommendation capability. Extensive experiments on nine mainstream models demonstrate the significant performance improvement achieved by MOTOR, highlighting its effectiveness in enhancing multimodal recommendation systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19276v1-abstract-full').style.display = 'none'; document.getElementById('2410.19276v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages,6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19100">arXiv:2410.19100</a> <span> [<a href="https://arxiv.org/pdf/2410.19100">pdf</a>, <a href="https://arxiv.org/format/2410.19100">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> VideoWebArena: Evaluating Long Context Multimodal Agents with Video Understanding Web Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jang%2C+L">Lawrence Jang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yinheng Li</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+C">Charles Ding</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Justin Lin</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+P+P">Paul Pu Liang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Dan Zhao</a>, <a href="/search/cs?searchtype=author&query=Bonatti%2C+R">Rogerio Bonatti</a>, <a href="/search/cs?searchtype=author&query=Koishida%2C+K">Kazuhito Koishida</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19100v1-abstract-short" style="display: inline;"> Videos are often used to learn or extract the necessary information to complete tasks in ways different than what text and static imagery alone can provide. However, many existing agent benchmarks neglect long-context video understanding, instead focusing on text or static image inputs. To bridge this gap, we introduce VideoWebArena (VideoWA), a benchmark for evaluating the capabilities of long-co… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19100v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19100v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19100v1-abstract-full" style="display: none;"> Videos are often used to learn or extract the necessary information to complete tasks in ways different than what text and static imagery alone can provide. However, many existing agent benchmarks neglect long-context video understanding, instead focusing on text or static image inputs. To bridge this gap, we introduce VideoWebArena (VideoWA), a benchmark for evaluating the capabilities of long-context multimodal agents for video understanding. VideoWA consists of 2,021 web agent tasks based on manually crafted video tutorials, which total almost four hours of content. For our benchmark, we define a taxonomy of long-context video-based agent tasks with two main areas of focus: skill retention and factual retention. While skill retention tasks evaluate whether an agent can use a given human demonstration to complete a task efficiently, the factual retention task evaluates whether an agent can retrieve instruction-relevant information from a video to complete a task. We find that the best model achieves 13.3% success on factual retention tasks and 45.8% on factual retention QA pairs, far below human performance at 73.9% and 79.3%, respectively. On skill retention tasks, long-context models perform worse with tutorials than without, exhibiting a 5% performance decrease in WebArena tasks and a 10.3% decrease in VisualWebArena tasks. Our work highlights the need to improve the agentic abilities of long-context multimodal models and provides a testbed for future development with long-context video agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19100v1-abstract-full').style.display = 'none'; document.getElementById('2410.19100v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18639">arXiv:2410.18639</a> <span> [<a href="https://arxiv.org/pdf/2410.18639">pdf</a>, <a href="https://arxiv.org/format/2410.18639">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Attribution Score: Evaluating Training Data Influence in Diffusion Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jinxu Lin</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+L">Linwei Tao</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+M">Minjing Dong</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chang Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18639v2-abstract-short" style="display: inline;"> As diffusion models become increasingly popular, the misuse of copyrighted and private images has emerged as a major concern. One promising solution to mitigate this issue is identifying the contribution of specific training samples in generative models, a process known as data attribution. Existing data attribution methods for diffusion models typically quantify the contribution of a training sam… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18639v2-abstract-full').style.display = 'inline'; document.getElementById('2410.18639v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18639v2-abstract-full" style="display: none;"> As diffusion models become increasingly popular, the misuse of copyrighted and private images has emerged as a major concern. One promising solution to mitigate this issue is identifying the contribution of specific training samples in generative models, a process known as data attribution. Existing data attribution methods for diffusion models typically quantify the contribution of a training sample by evaluating the change in diffusion loss when the sample is included or excluded from the training process. However, we argue that the direct usage of diffusion loss cannot represent such a contribution accurately due to the calculation of diffusion loss. Specifically, these approaches measure the divergence between predicted and ground truth distributions, which leads to an indirect comparison between the predicted distributions and cannot represent the variances between model behaviors. To address these issues, we aim to measure the direct comparison between predicted distributions with an attribution score to analyse the training sample importance, which is achieved by Diffusion Attribution Score (DAS). Underpinned by rigorous theoretical analysis, we elucidate the effectiveness of DAS. Additionally, we explore strategies to accelerate DAS calculations, facilitating its application to large-scale diffusion models. Our extensive experiments across various datasets and diffusion models demonstrate that DAS significantly surpasses previous benchmarks in terms of the linear data-modelling score, establishing new state-of-the-art performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18639v2-abstract-full').style.display = 'none'; document.getElementById('2410.18639v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18585">arXiv:2410.18585</a> <span> [<a href="https://arxiv.org/pdf/2410.18585">pdf</a>, <a href="https://arxiv.org/format/2410.18585">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Aligning CodeLLMs with Direct Preference Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Miao%2C+Y">Yibo Miao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+B">Bofei Gao</a>, <a href="/search/cs?searchtype=author&query=Quan%2C+S">Shanghaoran Quan</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a>, <a href="/search/cs?searchtype=author&query=Zan%2C+D">Daoguang Zan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianyu Liu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Z">Zhijie Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18585v1-abstract-short" style="display: inline;"> The last year has witnessed the rapid progress of large language models (LLMs) across diverse domains. Among them, CodeLLMs have garnered particular attention because they can not only assist in completing various programming tasks but also represent the decision-making and logical reasoning capabilities of LLMs. However, current CodeLLMs mainly focus on pre-training and supervised fine-tuning sce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18585v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18585v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18585v1-abstract-full" style="display: none;"> The last year has witnessed the rapid progress of large language models (LLMs) across diverse domains. Among them, CodeLLMs have garnered particular attention because they can not only assist in completing various programming tasks but also represent the decision-making and logical reasoning capabilities of LLMs. However, current CodeLLMs mainly focus on pre-training and supervised fine-tuning scenarios, leaving the alignment stage, which is important for post-training LLMs, under-explored. This work first identifies that the commonly used PPO algorithm may be suboptimal for the alignment of CodeLLM because the involved reward rules are routinely coarse-grained and potentially flawed. We then advocate addressing this using the DPO algorithm. Based on only preference data pairs, DPO can render the model rank data automatically, giving rise to a fine-grained rewarding pattern more robust than human intervention. We also contribute a pipeline for collecting preference pairs for DPO on CodeLLMs. Studies show that our method significantly improves the performance of existing CodeLLMs on benchmarks such as MBPP and HumanEval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18585v1-abstract-full').style.display = 'none'; document.getElementById('2410.18585v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18540">arXiv:2410.18540</a> <span> [<a href="https://arxiv.org/pdf/2410.18540">pdf</a>, <a href="https://arxiv.org/ps/2410.18540">ps</a>, <a href="https://arxiv.org/format/2410.18540">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Formal Languages and Automata Theory">cs.FL</span> </div> </div> <p class="title is-5 mathjax"> Verifying Quantum Circuits with Level-Synchronized Tree Automata (Technical Report) </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Abdulla%2C+P+A">Parosh Aziz Abdulla</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yo-Ga Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yu-Fang Chen</a>, <a href="/search/cs?searchtype=author&query=Hol%C3%ADk%2C+L">Luk谩拧 Hol铆k</a>, <a href="/search/cs?searchtype=author&query=Leng%C3%A1l%2C+O">Ond艡ej Leng谩l</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jyun-Ao Lin</a>, <a href="/search/cs?searchtype=author&query=Lo%2C+F">Fang-Yi Lo</a>, <a href="/search/cs?searchtype=author&query=Tsai%2C+W">Wei-Lun Tsai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18540v1-abstract-short" style="display: inline;"> We present a new method for the verification of quantum circuits based on a novel symbolic representation of sets of quantum states using level-synchronized tree automata (LSTAs). LSTAs extend classical tree automata by labeling each transition with a set of choices, which are then used to synchronize subtrees of an accepted tree. Compared to the traditional tree automata, LSTAs have an incomparab… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18540v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18540v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18540v1-abstract-full" style="display: none;"> We present a new method for the verification of quantum circuits based on a novel symbolic representation of sets of quantum states using level-synchronized tree automata (LSTAs). LSTAs extend classical tree automata by labeling each transition with a set of choices, which are then used to synchronize subtrees of an accepted tree. Compared to the traditional tree automata, LSTAs have an incomparable expressive power while maintaining important properties, such as closure under union and intersection, and decidable language emptiness and inclusion. We have developed an efficient and fully automated symbolic verification algorithm for quantum circuits based on LSTAs. The complexity of supported gate operations is at most quadratic, dramatically improving the exponential worst-case complexity of an earlier tree automata-based approach. Furthermore, we show that LSTAs are a promising model for parameterized verification, i.e., verifying the correctness of families of circuits with the same structure for any number of qubits involved, which principally lies beyond the capabilities of previous automated approaches. We implemented this method as a C++ tool and compared it with three symbolic quantum circuit verifiers and two simulators on several benchmark examples. The results show that our approach can solve problems with sizes orders of magnitude larger than the state of the art. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18540v1-abstract-full').style.display = 'none'; document.getElementById('2410.18540v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17131">arXiv:2410.17131</a> <span> [<a href="https://arxiv.org/pdf/2410.17131">pdf</a>, <a href="https://arxiv.org/format/2410.17131">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Aligning Large Language Models via Self-Steering Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiang%2C+H">Hao Xiang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H">Hongyu Lin</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Keming Lu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yaojie Lu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xianpei Han</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Le Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingren Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17131v1-abstract-short" style="display: inline;"> Automated alignment develops alignment systems with minimal human intervention. The key to automated alignment lies in providing learnable and accurate preference signals for preference learning without human annotation. In this paper, we introduce Self-Steering Optimization ($SSO$), an algorithm that autonomously generates high-quality preference signals based on predefined principles during iter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17131v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17131v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17131v1-abstract-full" style="display: none;"> Automated alignment develops alignment systems with minimal human intervention. The key to automated alignment lies in providing learnable and accurate preference signals for preference learning without human annotation. In this paper, we introduce Self-Steering Optimization ($SSO$), an algorithm that autonomously generates high-quality preference signals based on predefined principles during iterative training, eliminating the need for manual annotation. $SSO$ maintains the accuracy of signals by ensuring a consistent gap between chosen and rejected responses while keeping them both on-policy to suit the current policy model's learning capacity. $SSO$ can benefit the online and offline training of the policy model, as well as enhance the training of reward models. We validate the effectiveness of $SSO$ with two foundation models, Qwen2 and Llama3.1, indicating that it provides accurate, on-policy preference signals throughout iterative training. Without any manual annotation or external models, $SSO$ leads to significant performance improvements across six subjective or objective benchmarks. Besides, the preference data generated by $SSO$ significantly enhanced the performance of the reward model on Rewardbench. Our work presents a scalable approach to preference optimization, paving the way for more efficient and effective automated alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17131v1-abstract-full').style.display = 'none'; document.getElementById('2410.17131v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16788">arXiv:2410.16788</a> <span> [<a href="https://arxiv.org/pdf/2410.16788">pdf</a>, <a href="https://arxiv.org/format/2410.16788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Correct after Answer: Enhancing Multi-Span Question Answering with Post-Processing Method </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiayi Lin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+H">Haibo Tong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dongyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Hong%2C+Q">Qingqing Hong</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+B">Bingxuan Hou</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junli Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16788v1-abstract-short" style="display: inline;"> Multi-Span Question Answering (MSQA) requires models to extract one or multiple answer spans from a given context to answer a question. Prior work mainly focuses on designing specific methods or applying heuristic strategies to encourage models to predict more correct predictions. However, these models are trained on gold answers and fail to consider the incorrect predictions. Through a statistica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16788v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16788v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16788v1-abstract-full" style="display: none;"> Multi-Span Question Answering (MSQA) requires models to extract one or multiple answer spans from a given context to answer a question. Prior work mainly focuses on designing specific methods or applying heuristic strategies to encourage models to predict more correct predictions. However, these models are trained on gold answers and fail to consider the incorrect predictions. Through a statistical analysis, we observe that models with stronger abilities do not predict less incorrect predictions compared with other models. In this work, we propose Answering-Classifying-Correcting (ACC) framework, which employs a post-processing strategy to handle incorrect predictions. Specifically, the ACC framework first introduces a classifier to classify the predictions into three types and exclude "wrong predictions", then introduces a corrector to modify "partially correct predictions". Experiments on several MSQA datasets show that ACC framework significantly improves the Exact Match (EM) scores, and further analysis demostrates that ACC framework efficiently reduces the number of incorrect predictions, improving the quality of predictions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16788v1-abstract-full').style.display = 'none'; document.getElementById('2410.16788v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by EMNLP 2024 Findings</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16080">arXiv:2410.16080</a> <span> [<a href="https://arxiv.org/pdf/2410.16080">pdf</a>, <a href="https://arxiv.org/format/2410.16080">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Unleashing the Potential of Multi-Channel Fusion in Retrieval for Personalized Recommendations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+J">Junjie Huang</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+J">Jiarui Qin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jianghao Lin</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Z">Ziming Feng</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yong Yu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weinan Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16080v1-abstract-short" style="display: inline;"> Recommender systems (RS) are pivotal in managing information overload in modern digital services. A key challenge in RS is efficiently processing vast item pools to deliver highly personalized recommendations under strict latency constraints. Multi-stage cascade ranking addresses this by employing computationally efficient retrieval methods to cover diverse user interests, followed by more precise… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16080v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16080v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16080v1-abstract-full" style="display: none;"> Recommender systems (RS) are pivotal in managing information overload in modern digital services. A key challenge in RS is efficiently processing vast item pools to deliver highly personalized recommendations under strict latency constraints. Multi-stage cascade ranking addresses this by employing computationally efficient retrieval methods to cover diverse user interests, followed by more precise ranking models to refine the results. In the retrieval stage, multi-channel retrieval is often used to generate distinct item subsets from different candidate generators, leveraging the complementary strengths of these methods to maximize coverage. However, forwarding all retrieved items overwhelms downstream rankers, necessitating truncation. Despite advancements in individual retrieval methods, multi-channel fusion, the process of efficiently merging multi-channel retrieval results, remains underexplored. We are the first to identify and systematically investigate multi-channel fusion in the retrieval stage. Current industry practices often rely on heuristic approaches and manual designs, which often lead to suboptimal performance. Moreover, traditional gradient-based methods like SGD are unsuitable for this task due to the non-differentiable nature of the selection process. In this paper, we explore advanced channel fusion strategies by assigning systematically optimized weights to each channel. We utilize black-box optimization techniques, including the Cross Entropy Method and Bayesian Optimization for global weight optimization, alongside policy gradient-based approaches for personalized merging. Our methods enhance both personalization and flexibility, achieving significant performance improvements across multiple datasets and yielding substantial gains in real-world deployments, offering a scalable solution for optimizing multi-channel fusion in retrieval. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16080v1-abstract-full').style.display = 'none'; document.getElementById('2410.16080v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.15827">arXiv:2410.15827</a> <span> [<a href="https://arxiv.org/pdf/2410.15827">pdf</a>, <a href="https://arxiv.org/format/2410.15827">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Explainability of Highly Associated Fuzzy Churn Patterns in Binary Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+D+Y+C">D. Y. C. Wang</a>, <a href="/search/cs?searchtype=author&query=Jordanger%2C+L+A">Lars Arne Jordanger</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J+C">Jerry Chun-Wei Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.15827v1-abstract-short" style="display: inline;"> Customer churn, particularly in the telecommunications sector, influences both costs and profits. As the explainability of models becomes increasingly important, this study emphasizes not only the explainability of customer churn through machine learning models, but also the importance of identifying multivariate patterns and setting soft bounds for intuitive interpretation. The main objective is… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15827v1-abstract-full').style.display = 'inline'; document.getElementById('2410.15827v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.15827v1-abstract-full" style="display: none;"> Customer churn, particularly in the telecommunications sector, influences both costs and profits. As the explainability of models becomes increasingly important, this study emphasizes not only the explainability of customer churn through machine learning models, but also the importance of identifying multivariate patterns and setting soft bounds for intuitive interpretation. The main objective is to use a machine learning model and fuzzy-set theory with top-\textit{k} HUIM to identify highly associated patterns of customer churn with intuitive identification, referred to as Highly Associated Fuzzy Churn Patterns (HAFCP). Moreover, this method aids in uncovering association rules among multiple features across low, medium, and high distributions. Such discoveries are instrumental in enhancing the explainability of findings. Experiments show that when the top-5 HAFCPs are included in five datasets, a mixture of performance results is observed, with some showing notable improvements. It becomes clear that high importance features enhance explanatory power through their distribution and patterns associated with other features. As a result, the study introduces an innovative approach that improves the explainability and effectiveness of customer churn prediction models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.15827v1-abstract-full').style.display = 'none'; document.getElementById('2410.15827v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages single columns, 4 figures, This paper is an extended version of a work originally presented at the 6th International Workshop on Utility-Driven Mining and Learning (held in conjunction with the 28th Pacific-Asia Conference on Knowledge Discovery and Data Mining - PAKDD 2024) on May 7, 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14961">arXiv:2410.14961</a> <span> [<a href="https://arxiv.org/pdf/2410.14961">pdf</a>, <a href="https://arxiv.org/format/2410.14961">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> LangGFM: A Large Language Model Alone Can be a Powerful Graph Foundation Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+T">Tianqianjin Lin</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+P">Pengwei Yan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+K">Kaisong Song</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhuoren Jiang</a>, <a href="/search/cs?searchtype=author&query=Kang%2C+Y">Yangyang Kang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jun Lin</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+W">Weikang Yuan</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Junjie Cao</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+C">Changlong Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaozhong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14961v1-abstract-short" style="display: inline;"> Graph foundation models (GFMs) have recently gained significant attention. However, the unique data processing and evaluation setups employed by different studies hinder a deeper understanding of their progress. Additionally, current research tends to focus on specific subsets of graph learning tasks, such as structural tasks, node-level tasks, or classification tasks. As a result, they often inco… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14961v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14961v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14961v1-abstract-full" style="display: none;"> Graph foundation models (GFMs) have recently gained significant attention. However, the unique data processing and evaluation setups employed by different studies hinder a deeper understanding of their progress. Additionally, current research tends to focus on specific subsets of graph learning tasks, such as structural tasks, node-level tasks, or classification tasks. As a result, they often incorporate specialized modules tailored to particular task types, losing their applicability to other graph learning tasks and contradicting the original intent of foundation models to be universal. Therefore, to enhance consistency, coverage, and diversity across domains, tasks, and research interests within the graph learning community in the evaluation of GFMs, we propose GFMBench-a systematic and comprehensive benchmark comprising 26 datasets. Moreover, we introduce LangGFM, a novel GFM that relies entirely on large language models. By revisiting and exploring the effective graph textualization principles, as well as repurposing successful techniques from graph augmentation and graph self-supervised learning within the language space, LangGFM achieves performance on par with or exceeding the state of the art across GFMBench, which can offer us new perspectives, experiences, and baselines to drive forward the evolution of GFMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14961v1-abstract-full').style.display = 'none'; document.getElementById('2410.14961v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14144">arXiv:2410.14144</a> <span> [<a href="https://arxiv.org/pdf/2410.14144">pdf</a>, <a href="https://arxiv.org/format/2410.14144">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Lightweight Multi Aspect Controlled Text Generation Solution For Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiayi Lin</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+H">Haibo Tong</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+B">Bingxuan Hou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+D">Dongyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jialin Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junli Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14144v1-abstract-short" style="display: inline;"> Large language models (LLMs) show remarkable abilities with instruction tuning. However, they fail to achieve ideal tasks when lacking high-quality instruction tuning data on target tasks. Multi-Aspect Controllable Text Generation (MCTG) is a representative task for this dilemma, where aspect datasets are usually biased and correlated. Existing work exploits additional model structures and strateg… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14144v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14144v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14144v1-abstract-full" style="display: none;"> Large language models (LLMs) show remarkable abilities with instruction tuning. However, they fail to achieve ideal tasks when lacking high-quality instruction tuning data on target tasks. Multi-Aspect Controllable Text Generation (MCTG) is a representative task for this dilemma, where aspect datasets are usually biased and correlated. Existing work exploits additional model structures and strategies for solutions, limiting adaptability to LLMs. To activate MCTG ability of LLMs, we propose a lightweight MCTG pipeline based on data augmentation. We analyze bias and correlations in traditional datasets, and address these concerns with augmented control attributes and sentences. Augmented datasets are feasible for instruction tuning. In our experiments, LLMs perform better in MCTG after data augmentation, with a 20% accuracy rise and less aspect correlations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14144v1-abstract-full').style.display = 'none'; document.getElementById('2410.14144v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13716">arXiv:2410.13716</a> <span> [<a href="https://arxiv.org/pdf/2410.13716">pdf</a>, <a href="https://arxiv.org/format/2410.13716">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MIRAGE-Bench: Automatic Multilingual Benchmark Arena for Retrieval-Augmented Generation Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Thakur%2C+N">Nandan Thakur</a>, <a href="/search/cs?searchtype=author&query=Kazi%2C+S">Suleman Kazi</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+G">Ge Luo</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jimmy Lin</a>, <a href="/search/cs?searchtype=author&query=Ahmad%2C+A">Amin Ahmad</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13716v1-abstract-short" style="display: inline;"> Traditional Retrieval-Augmented Generation (RAG) benchmarks rely on different heuristic-based metrics for evaluation, but these require human preferences as ground truth for reference. In contrast, arena-based benchmarks, where two models compete each other, require an expensive Large Language Model (LLM) as a judge for a reliable evaluation. We present an easy and efficient technique to get the b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13716v1-abstract-full').style.display = 'inline'; document.getElementById('2410.13716v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13716v1-abstract-full" style="display: none;"> Traditional Retrieval-Augmented Generation (RAG) benchmarks rely on different heuristic-based metrics for evaluation, but these require human preferences as ground truth for reference. In contrast, arena-based benchmarks, where two models compete each other, require an expensive Large Language Model (LLM) as a judge for a reliable evaluation. We present an easy and efficient technique to get the best of both worlds. The idea is to train a learning to rank model as a "surrogate" judge using RAG-based evaluation heuristics as input, to produce a synthetic arena-based leaderboard. Using this idea, We develop MIRAGE-Bench, a standardized arena-based multilingual RAG benchmark for 18 diverse languages on Wikipedia. The benchmark is constructed using MIRACL, a retrieval dataset, and extended for multilingual generation evaluation. MIRAGE-Bench evaluates RAG extensively coupling both heuristic features and LLM as a judge evaluator. In our work, we benchmark 19 diverse multilingual-focused LLMs, and achieve a high correlation (Kendall Tau ($蟿$) = 0.909) using our surrogate judge learned using heuristic features with pairwise evaluations and between GPT-4o as a teacher on the MIRAGE-Bench leaderboard using the Bradley-Terry framework. We observe proprietary and large open-source LLMs currently dominate in multilingual RAG. MIRAGE-Bench is available at: https://github.com/vectara/mirage-bench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13716v1-abstract-full').style.display = 'none'; document.getElementById('2410.13716v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11473">arXiv:2410.11473</a> <span> [<a href="https://arxiv.org/pdf/2410.11473">pdf</a>, <a href="https://arxiv.org/format/2410.11473">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> InvSeg: Test-Time Prompt Inversion for Semantic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiayi Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiabo Huang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jian Hu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+S">Shaogang Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11473v1-abstract-short" style="display: inline;"> Visual-textual correlations in the attention maps derived from text-to-image diffusion models are proven beneficial to dense visual prediction tasks, e.g., semantic segmentation. However, a significant challenge arises due to the input distributional discrepancy between the context-rich sentences used for image generation and the isolated class names typically employed in semantic segmentation, hi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11473v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11473v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11473v1-abstract-full" style="display: none;"> Visual-textual correlations in the attention maps derived from text-to-image diffusion models are proven beneficial to dense visual prediction tasks, e.g., semantic segmentation. However, a significant challenge arises due to the input distributional discrepancy between the context-rich sentences used for image generation and the isolated class names typically employed in semantic segmentation, hindering the diffusion models from capturing accurate visual-textual correlations. To solve this, we propose InvSeg, a test-time prompt inversion method that tackles open-vocabulary semantic segmentation by inverting image-specific visual context into text prompt embedding space, leveraging structure information derived from the diffusion model's reconstruction process to enrich text prompts so as to associate each class with a structure-consistent mask. Specifically, we introduce Contrastive Soft Clustering (CSC) to align derived masks with the image's structure information, softly selecting anchors for each class and calculating weighted distances to push inner-class pixels closer while separating inter-class pixels, thereby ensuring mask distinction and internal consistency. By incorporating sample-specific context, InvSeg learns context-rich text prompts in embedding space and achieves accurate semantic alignment across modalities. Experiments show that InvSeg achieves state-of-the-art performance on the PASCAL VOC and Context datasets. Project page: https://jylin8100.github.io/InvSegProject/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11473v1-abstract-full').style.display = 'none'; document.getElementById('2410.11473v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11235">arXiv:2410.11235</a> <span> [<a href="https://arxiv.org/pdf/2410.11235">pdf</a>, <a href="https://arxiv.org/format/2410.11235">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Unleashing the Power of LLMs as Multi-Modal Encoders for Text and Graph-Structured Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiacheng Lin</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+K">Kun Qian</a>, <a href="/search/cs?searchtype=author&query=Han%2C+H">Haoyu Han</a>, <a href="/search/cs?searchtype=author&query=Choudhary%2C+N">Nurendra Choudhary</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+T">Tianxin Wei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongruo Wang</a>, <a href="/search/cs?searchtype=author&query=Genc%2C+S">Sahika Genc</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+E+W">Edward W Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sheng Wang</a>, <a href="/search/cs?searchtype=author&query=Subbian%2C+K">Karthik Subbian</a>, <a href="/search/cs?searchtype=author&query=Koutra%2C+D">Danai Koutra</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jimeng Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11235v1-abstract-short" style="display: inline;"> Graph-structured information offers rich contextual information that can enhance language models by providing structured relationships and hierarchies, leading to more expressive embeddings for various applications such as retrieval, question answering, and classification. However, existing methods for integrating graph and text embeddings, often based on Multi-layer Perceptrons (MLPs) or shallow… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11235v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11235v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11235v1-abstract-full" style="display: none;"> Graph-structured information offers rich contextual information that can enhance language models by providing structured relationships and hierarchies, leading to more expressive embeddings for various applications such as retrieval, question answering, and classification. However, existing methods for integrating graph and text embeddings, often based on Multi-layer Perceptrons (MLPs) or shallow transformers, are limited in their ability to fully exploit the heterogeneous nature of these modalities. To overcome this, we propose Janus, a simple yet effective framework that leverages Large Language Models (LLMs) to jointly encode text and graph data. Specifically, Janus employs an MLP adapter to project graph embeddings into the same space as text embeddings, allowing the LLM to process both modalities jointly. Unlike prior work, we also introduce contrastive learning to align the graph and text spaces more effectively, thereby improving the quality of learned joint embeddings. Empirical results across six datasets spanning three tasks, knowledge graph-contextualized question answering, graph-text pair classification, and retrieval, demonstrate that Janus consistently outperforms existing baselines, achieving significant improvements across multiple datasets, with gains of up to 11.4% in QA tasks. These results highlight Janus's effectiveness in integrating graph and text data. Ablation studies further validate the effectiveness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11235v1-abstract-full').style.display = 'none'; document.getElementById('2410.11235v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11123">arXiv:2410.11123</a> <span> [<a href="https://arxiv.org/pdf/2410.11123">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> A Systematic Review on Prompt Engineering in Large Language Models for K-12 STEM Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+E">Eason Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Danyang Wang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Luyi Xu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+C">Chen Cao</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+X">Xiao Fang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jionghao Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.11123v1-abstract-short" style="display: inline;"> Large language models (LLMs) have the potential to enhance K-12 STEM education by improving both teaching and learning processes. While previous studies have shown promising results, there is still a lack of comprehensive understanding regarding how LLMs are effectively applied, specifically through prompt engineering-the process of designing prompts to generate desired outputs. To address this ga… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11123v1-abstract-full').style.display = 'inline'; document.getElementById('2410.11123v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.11123v1-abstract-full" style="display: none;"> Large language models (LLMs) have the potential to enhance K-12 STEM education by improving both teaching and learning processes. While previous studies have shown promising results, there is still a lack of comprehensive understanding regarding how LLMs are effectively applied, specifically through prompt engineering-the process of designing prompts to generate desired outputs. To address this gap, our study investigates empirical research published between 2021 and 2024 that explores the use of LLMs combined with prompt engineering in K-12 STEM education. Following the PRISMA protocol, we screened 2,654 papers and selected 30 studies for analysis. Our review identifies the prompting strategies employed, the types of LLMs used, methods of evaluating effectiveness, and limitations in prior work. Results indicate that while simple and zero-shot prompting are commonly used, more advanced techniques like few-shot and chain-of-thought prompting have demonstrated positive outcomes for various educational tasks. GPT-series models are predominantly used, but smaller and fine-tuned models (e.g., Blender 7B) paired with effective prompt engineering outperform prompting larger models (e.g., GPT-3) in specific contexts. Evaluation methods vary significantly, with limited empirical validation in real-world settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.11123v1-abstract-full').style.display = 'none'; document.getElementById('2410.11123v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10745">arXiv:2410.10745</a> <span> [<a href="https://arxiv.org/pdf/2410.10745">pdf</a>, <a href="https://arxiv.org/format/2410.10745">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FlexGen: Flexible Multi-View Generation from Text and Image Inputs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xinli Xu</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+W">Wenhang Ge</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiantao Lin</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+J">Jiawei Feng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Lie Xu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">HanFeng Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shunsi Zhang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Cong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10745v1-abstract-short" style="display: inline;"> In this work, we introduce FlexGen, a flexible framework designed to generate controllable and consistent multi-view images, conditioned on a single-view image, or a text prompt, or both. FlexGen tackles the challenges of controllable multi-view synthesis through additional conditioning on 3D-aware text annotations. We utilize the strong reasoning capabilities of GPT-4V to generate 3D-aware text a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10745v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10745v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10745v1-abstract-full" style="display: none;"> In this work, we introduce FlexGen, a flexible framework designed to generate controllable and consistent multi-view images, conditioned on a single-view image, or a text prompt, or both. FlexGen tackles the challenges of controllable multi-view synthesis through additional conditioning on 3D-aware text annotations. We utilize the strong reasoning capabilities of GPT-4V to generate 3D-aware text annotations. By analyzing four orthogonal views of an object arranged as tiled multi-view images, GPT-4V can produce text annotations that include 3D-aware information with spatial relationship. By integrating the control signal with proposed adaptive dual-control module, our model can generate multi-view images that correspond to the specified text. FlexGen supports multiple controllable capabilities, allowing users to modify text prompts to generate reasonable and corresponding unseen parts. Additionally, users can influence attributes such as appearance and material properties, including metallic and roughness. Extensive experiments demonstrate that our approach offers enhanced multiple controllability, marking a significant advancement over existing multi-view diffusion models. This work has substantial implications for fields requiring rapid and flexible 3D content creation, including game development, animation, and virtual reality. Project page: https://xxu068.github.io/flexgen.github.io/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10745v1-abstract-full').style.display = 'none'; document.getElementById('2410.10745v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 13 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10394">arXiv:2410.10394</a> <span> [<a href="https://arxiv.org/pdf/2410.10394">pdf</a>, <a href="https://arxiv.org/format/2410.10394">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> PIVOT-R: Primitive-Driven Waypoint-Aware World Model for Robotic Manipulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaidong Zhang</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+P">Pengzhen Ren</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+B">Bingqian Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junfan Lin</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+S">Shikui Ma</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+H">Hang Xu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaodan Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10394v2-abstract-short" style="display: inline;"> Language-guided robotic manipulation is a challenging task that requires an embodied agent to follow abstract user instructions to accomplish various complex manipulation tasks. Previous work trivially fitting the data without revealing the relation between instruction and low-level executable actions, these models are prone to memorizing the surficial pattern of the data instead of acquiring the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10394v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10394v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10394v2-abstract-full" style="display: none;"> Language-guided robotic manipulation is a challenging task that requires an embodied agent to follow abstract user instructions to accomplish various complex manipulation tasks. Previous work trivially fitting the data without revealing the relation between instruction and low-level executable actions, these models are prone to memorizing the surficial pattern of the data instead of acquiring the transferable knowledge, and thus are fragile to dynamic environment changes. To address this issue, we propose a PrIrmitive-driVen waypOinT-aware world model for Robotic manipulation (PIVOT-R) that focuses solely on the prediction of task-relevant waypoints. Specifically, PIVOT-R consists of a Waypoint-aware World Model (WAWM) and a lightweight action prediction module. The former performs primitive action parsing and primitive-driven waypoint prediction, while the latter focuses on decoding low-level actions. Additionally, we also design an asynchronous hierarchical executor (AHE), which can use different execution frequencies for different modules of the model, thereby helping the model reduce computational redundancy and improve model execution efficiency. Our PIVOT-R outperforms state-of-the-art (SoTA) open-source models on the SeaWave benchmark, achieving an average relative improvement of 19.45% across four levels of instruction tasks. Moreover, compared to the synchronously executed PIVOT-R, the execution efficiency of PIVOT-R with AHE is increased by 28-fold, with only a 2.9% drop in performance. These results provide compelling evidence that our PIVOT-R can significantly improve both the performance and efficiency of robotic manipulation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10394v2-abstract-full').style.display = 'none'; document.getElementById('2410.10394v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09335">arXiv:2410.09335</a> <span> [<a href="https://arxiv.org/pdf/2410.09335">pdf</a>, <a href="https://arxiv.org/format/2410.09335">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Data Selection at Scale: Random Selection is Almost All You Need </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+T">Tingyu Xia</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Bowen Yu</a>, <a href="/search/cs?searchtype=author&query=Dang%2C+K">Kai Dang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+A">An Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuan Wu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yuan Tian</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yi Chang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Junyang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09335v1-abstract-short" style="display: inline;"> Supervised fine-tuning (SFT) is crucial for aligning Large Language Models (LLMs) with human instructions. The primary goal during SFT is to select a small yet representative subset of training data from the larger pool, such that fine-tuning with this subset achieves results comparable to or even exceeding those obtained using the entire dataset. However, most existing data selection techniques a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09335v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09335v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09335v1-abstract-full" style="display: none;"> Supervised fine-tuning (SFT) is crucial for aligning Large Language Models (LLMs) with human instructions. The primary goal during SFT is to select a small yet representative subset of training data from the larger pool, such that fine-tuning with this subset achieves results comparable to or even exceeding those obtained using the entire dataset. However, most existing data selection techniques are designed for small-scale data pools, which fail to meet the demands of real-world SFT scenarios. In this paper, we replicated several self-scoring methods those that do not rely on external model assistance on two million scale datasets, and found that nearly all methods struggled to significantly outperform random selection when dealing with such large-scale data pools. Moreover, our comparisons suggest that, during SFT, diversity in data selection is more critical than simply focusing on high quality data. We also analyzed the limitations of several current approaches, explaining why they perform poorly on large-scale datasets and why they are unsuitable for such contexts. Finally, we found that filtering data by token length offers a stable and efficient method for improving results. This approach, particularly when training on long text data, proves highly beneficial for relatively weaker base models, such as Llama3. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09335v1-abstract-full').style.display = 'none'; document.getElementById('2410.09335v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09239">arXiv:2410.09239</a> <span> [<a href="https://arxiv.org/pdf/2410.09239">pdf</a>, <a href="https://arxiv.org/format/2410.09239">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Scaling Gaussian Processes for Learning Curve Prediction via Latent Kronecker Structure </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+J+A">Jihao Andreas Lin</a>, <a href="/search/cs?searchtype=author&query=Ament%2C+S">Sebastian Ament</a>, <a href="/search/cs?searchtype=author&query=Balandat%2C+M">Maximilian Balandat</a>, <a href="/search/cs?searchtype=author&query=Bakshy%2C+E">Eytan Bakshy</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09239v1-abstract-short" style="display: inline;"> A key task in AutoML is to model learning curves of machine learning models jointly as a function of model hyper-parameters and training progression. While Gaussian processes (GPs) are suitable for this task, na茂ve GPs require $\mathcal{O}(n^3m^3)$ time and $\mathcal{O}(n^2 m^2)$ space for $n$ hyper-parameter configurations and $\mathcal{O}(m)$ learning curve observations per hyper-parameter. Effi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09239v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09239v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09239v1-abstract-full" style="display: none;"> A key task in AutoML is to model learning curves of machine learning models jointly as a function of model hyper-parameters and training progression. While Gaussian processes (GPs) are suitable for this task, na茂ve GPs require $\mathcal{O}(n^3m^3)$ time and $\mathcal{O}(n^2 m^2)$ space for $n$ hyper-parameter configurations and $\mathcal{O}(m)$ learning curve observations per hyper-parameter. Efficient inference via Kronecker structure is typically incompatible with early-stopping due to missing learning curve values. We impose $\textit{latent Kronecker structure}$ to leverage efficient product kernels while handling missing values. In particular, we interpret the joint covariance matrix of observed values as the projection of a latent Kronecker product. Combined with iterative linear solvers and structured matrix-vector multiplication, our method only requires $\mathcal{O}(n^3 + m^3)$ time and $\mathcal{O}(n^2 + m^2)$ space. We show that our GP model can match the performance of a Transformer on a learning curve prediction task. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09239v1-abstract-full').style.display = 'none'; document.getElementById('2410.09239v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Bayesian Decision-making and Uncertainty Workshop at NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08935">arXiv:2410.08935</a> <span> [<a href="https://arxiv.org/pdf/2410.08935">pdf</a>, <a href="https://arxiv.org/format/2410.08935">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Voxel-SLAM: A Complete, Accurate, and Versatile LiDAR-Inertial SLAM System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zheng Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Haotian Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+C">Chongjian Yuan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+J">Jiarong Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Rundong Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chunran Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bingyang Zhou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenyi Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08935v1-abstract-short" style="display: inline;"> In this work, we present Voxel-SLAM: a complete, accurate, and versatile LiDAR-inertial SLAM system that fully utilizes short-term, mid-term, long-term, and multi-map data associations to achieve real-time estimation and high precision mapping. The system consists of five modules: initialization, odometry, local mapping, loop closure, and global mapping, all employing the same map representation,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08935v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08935v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08935v1-abstract-full" style="display: none;"> In this work, we present Voxel-SLAM: a complete, accurate, and versatile LiDAR-inertial SLAM system that fully utilizes short-term, mid-term, long-term, and multi-map data associations to achieve real-time estimation and high precision mapping. The system consists of five modules: initialization, odometry, local mapping, loop closure, and global mapping, all employing the same map representation, an adaptive voxel map. The initialization provides an accurate initial state estimation and a consistent local map for subsequent modules, enabling the system to start with a highly dynamic initial state. The odometry, exploiting the short-term data association, rapidly estimates current states and detects potential system divergence. The local mapping, exploiting the mid-term data association, employs a local LiDAR-inertial bundle adjustment (BA) to refine the states (and the local map) within a sliding window of recent LiDAR scans. The loop closure detects previously visited places in the current and all previous sessions. The global mapping refines the global map with an efficient hierarchical global BA. The loop closure and global mapping both exploit long-term and multi-map data associations. We conducted a comprehensive benchmark comparison with other state-of-the-art methods across 30 sequences from three representative scenes, including narrow indoor environments using hand-held equipment, large-scale wilderness environments with aerial robots, and urban environments on vehicle platforms. Other experiments demonstrate the robustness and efficiency of the initialization, the capacity to work in multiple sessions, and relocalization in degenerated environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08935v1-abstract-full').style.display = 'none'; document.getElementById('2410.08935v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lin%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lin%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository