From 4ee5a1d11471d25656e5073ed93d970ad5d2ac84 Mon Sep 17 00:00:00 2001 From: Roberto Rodriguez <9653181+Cyb3rWard0g@users.noreply.github.com> Date: Mon, 13 Jan 2025 21:33:28 -0500 Subject: [PATCH] Updated arxiv module to include a summary whenever user wants to. --- cookbook/arxiv_search.ipynb | 868 ++++++++++++++++------------ src/floki/document/fetcher/arxiv.py | 87 ++- 2 files changed, 570 insertions(+), 385 deletions(-) diff --git a/cookbook/arxiv_search.ipynb b/cookbook/arxiv_search.ipynb index 6ccd354..65a85ab 100644 --- a/cookbook/arxiv_search.ipynb +++ b/cookbook/arxiv_search.ipynb @@ -56,16 +56,7 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/wardog/Documents/GitHub/floki/.venv/lib/python3.12/site-packages/pydantic/_internal/_generate_schema.py:777: UserWarning: Mixing V1 models and V2 models (or constructs, like `TypeAdapter`) is not supported. Please upgrade `Settings` to V2.\n", - " warn(\n" - ] - } - ], + "outputs": [], "source": [ "from floki.document import ArxivFetcher\n", "\n", @@ -93,7 +84,7 @@ "text": [ "INFO:floki.document.fetcher.arxiv:Searching for query: machine learning\n", "INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=machine+learning&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100\n", - "INFO:arxiv:Got first page: 100 of 374510 total results\n", + "INFO:arxiv:Got first page: 100 of 376201 total results\n", "INFO:floki.document.fetcher.arxiv:Found 5 results for query: machine learning\n" ] }, @@ -101,121 +92,110 @@ "name": "stdout", "output_type": "stream", "text": [ - "Title: PERSE: Personalized 3D Generative Avatars from A Single Portrait\n", - "Authors: Hyunsoo Cha, Inhee Lee, Hanbyul Joo\n", - "Summary: We present PERSE, a method for building an animatable personalized generative\n", - "avatar from a reference portrait. 
Our avatar model enables facial attribute\n", - "editing in a continuous and disentangled latent space to control each facial\n", - "attribute, while preserving the individual's identity. To achieve this, our\n", - "method begins by synthesizing large-scale synthetic 2D video datasets, where\n", - "each video contains consistent changes in the facial expression and viewpoint,\n", - "combined with a variation in a specific facial attribute from the original\n", - "input. We propose a novel pipeline to produce high-quality, photorealistic 2D\n", - "videos with facial attribute editing. Leveraging this synthetic attribute\n", - "dataset, we present a personalized avatar creation method based on the 3D\n", - "Gaussian Splatting, learning a continuous and disentangled latent space for\n", - "intuitive facial attribute manipulation. To enforce smooth transitions in this\n", - "latent space, we introduce a latent space regularization technique by using\n", - "interpolated 2D faces as supervision. Compared to previous approaches, we\n", - "demonstrate that PERSE generates high-quality avatars with interpolated\n", - "attributes while preserving identity of reference person.\n", + "Title: LlamaV-o1: Rethinking Step-by-step Visual Reasoning in LLMs\n", + "Authors: Omkar Thawakar, Dinura Dissanayake, Ketan More, Ritesh Thawkar, Ahmed Heakl, Noor Ahsan, Yuhao Li, Mohammed Zumri, Jean Lahoud, Rao Muhammad Anwer, Hisham Cholakkal, Ivan Laptev, Mubarak Shah, Fahad Shahbaz Khan, Salman Khan\n", + "Summary: Reasoning is a fundamental capability for solving complex multi-step\n", + "problems, particularly in visual contexts where sequential step-wise\n", + "understanding is essential. 
Existing approaches lack a comprehensive framework\n", + "for evaluating visual reasoning and do not emphasize step-wise problem-solving.\n", + "To this end, we propose a comprehensive framework for advancing step-by-step\n", + "visual reasoning in large language models (LMMs) through three key\n", + "contributions. First, we introduce a visual reasoning benchmark specifically\n", + "designed to evaluate multi-step reasoning tasks. The benchmark presents a\n", + "diverse set of challenges with eight different categories ranging from complex\n", + "visual perception to scientific reasoning with over 4k reasoning steps in\n", + "total, enabling robust evaluation of LLMs' abilities to perform accurate and\n", + "interpretable visual reasoning across multiple steps. Second, we propose a\n", + "novel metric that assesses visual reasoning quality at the granularity of\n", + "individual steps, emphasizing both correctness and logical coherence. The\n", + "proposed metric offers deeper insights into reasoning performance compared to\n", + "traditional end-task accuracy metrics. Third, we present a new multimodal\n", + "visual reasoning model, named LlamaV-o1, trained using a multi-step curriculum\n", + "learning approach, where tasks are progressively organized to facilitate\n", + "incremental skill acquisition and problem-solving. The proposed LlamaV-o1 is\n", + "designed for multi-step reasoning and learns step-by-step through a structured\n", + "training paradigm. Extensive experiments show that our LlamaV-o1 outperforms\n", + "existing open-source models and performs favorably against close-source\n", + "proprietary models. Compared to the recent Llava-CoT, our LlamaV-o1 achieves an\n", + "average score of 67.3 with an absolute gain of 3.8\\% across six benchmarks\n", + "while being 5 times faster during inference scaling. 
Our benchmark, model, and\n", + "code are publicly available.\n", "\n", - "Title: Action-Agnostic Point-Level Supervision for Temporal Action Detection\n", - "Authors: Shuhei M. Yoshida, Takashi Shibata, Makoto Terao, Takayuki Okatani, Masashi Sugiyama\n", - "Summary: We propose action-agnostic point-level (AAPL) supervision for temporal action\n", - "detection to achieve accurate action instance detection with a lightly\n", - "annotated dataset. In the proposed scheme, a small portion of video frames is\n", - "sampled in an unsupervised manner and presented to human annotators, who then\n", - "label the frames with action categories. Unlike point-level supervision, which\n", - "requires annotators to search for every action instance in an untrimmed video,\n", - "frames to annotate are selected without human intervention in AAPL supervision.\n", - "We also propose a detection model and learning method to effectively utilize\n", - "the AAPL labels. Extensive experiments on the variety of datasets (THUMOS '14,\n", - "FineAction, GTEA, BEOID, and ActivityNet 1.3) demonstrate that the proposed\n", - "approach is competitive with or outperforms prior methods for video-level and\n", - "point-level supervision in terms of the trade-off between the annotation cost\n", - "and detection performance.\n", + "Title: ScooterLab: A Programmable and Participatory Sensing Research Testbed using Micromobility Vehicles\n", + "Authors: Ubaidullah Khan, Raveen Wijewickrama, Buddhi Ashan M. K., A. H. M. Nazmus Sakib, Khoi Trinh, Christina Duthie, Nima Najafian, Ahmer Patel, R. N. Molina, Anindya Maiti, Sushil K. Prasad, Greg P. Griffin, Murtuza Jadliwala\n", + "Summary: Micromobility vehicles, such as e-scooters, are increasingly popular in urban\n", + "communities but present significant challenges in terms of road safety, user\n", + "privacy, infrastructure planning, and civil engineering. 
Addressing these\n", + "critical issues requires a large-scale and easily accessible research\n", + "infrastructure to collect diverse mobility and contextual data from\n", + "micromobility users in realistic settings. To this end, we present ScooterLab,\n", + "a community research testbed comprising a fleet of customizable battery-powered\n", + "micromobility vehicles retrofitted with advanced sensing, communication, and\n", + "control capabilities. ScooterLab enables interdisciplinary research at the\n", + "intersection of computing, mobility, and urban planning by providing\n", + "researchers with tools to design and deploy customized sensing experiments and\n", + "access curated datasets. The testbed will enable advances in machine learning,\n", + "privacy, and urban transportation research while promoting sustainable\n", + "mobility.\n", "\n", - "Title: SoS Certificates for Sparse Singular Values and Their Applications: Robust Statistics, Subspace Distortion, and More\n", - "Authors: Ilias Diakonikolas, Samuel B. Hopkins, Ankit Pensia, Stefan Tiegel\n", - "Summary: We study $\\textit{sparse singular value certificates}$ for random rectangular\n", - "matrices. If $M$ is an $n \\times d$ matrix with independent Gaussian entries,\n", - "we give a new family of polynomial-time algorithms which can certify upper\n", - "bounds on the maximum of $\\|M u\\|$, where $u$ is a unit vector with at most\n", - "$\\eta n$ nonzero entries for a given $\\eta \\in (0,1)$. This basic algorithmic\n", - "primitive lies at the heart of a wide range of problems across algorithmic\n", - "statistics and theoretical computer science.\n", - " Our algorithms certify a bound which is asymptotically smaller than the naive\n", - "one, given by the maximum singular value of $M$, for nearly the widest-possible\n", - "range of $n,d,$ and $\\eta$. 
Efficiently certifying such a bound for a range of\n", - "$n,d$ and $\\eta$ which is larger by any polynomial factor than what is achieved\n", - "by our algorithm would violate lower bounds in the SQ and low-degree\n", - "polynomials models. Our certification algorithm makes essential use of the\n", - "Sum-of-Squares hierarchy. To prove the correctness of our algorithm, we develop\n", - "a new combinatorial connection between the graph matrix approach to analyze\n", - "random matrices with dependent entries, and the Efron-Stein decomposition of\n", - "functions of independent random variables.\n", - " As applications of our certification algorithm, we obtain new efficient\n", - "algorithms for a wide range of well-studied algorithmic tasks. In algorithmic\n", - "robust statistics, we obtain new algorithms for robust mean and covariance\n", - "estimation with tradeoffs between breakdown point and sample complexity, which\n", - "are nearly matched by SQ and low-degree polynomial lower bounds (that we\n", - "establish). We also obtain new polynomial-time guarantees for certification of\n", - "$\\ell_1/\\ell_2$ distortion of random subspaces of $\\mathbb{R}^n$ (also with\n", - "nearly matching lower bounds), sparse principal component analysis, and\n", - "certification of the $2\\rightarrow p$ norm of a random matrix.\n", + "Title: Machine Learning Force-Field Approach for Itinerant Electron Magnets\n", + "Authors: Sheng Zhang, Yunhao Fan, Kotaro Shimizu, Gia-Wei Chern\n", + "Summary: We review the recent development of machine-learning (ML) force-field\n", + "frameworks for Landau-Lifshitz-Gilbert (LLG) dynamics simulations of itinerant\n", + "electron magnets, focusing on the general theory and implementations of\n", + "symmetry-invariant representations of spin configurations. 
The crucial\n", + "properties that such magnetic descriptors must satisfy are differentiability\n", + "with respect to spin rotations and invariance to both lattice point-group\n", + "symmetry and internal spin rotation symmetry. We propose an efficient\n", + "implementation based on the concept of reference irreducible representations,\n", + "modified from the group-theoretical power-spectrum and bispectrum methods. The\n", + "ML framework is demonstrated using the s-d models, which are widely applied in\n", + "spintronics research. We show that LLG simulations based on local fields\n", + "predicted by the trained ML models successfully reproduce representative\n", + "non-collinear spin structures, including 120$^\\circ$, tetrahedral, and skyrmion\n", + "crystal orders of the triangular-lattice s-d models. Large-scale thermal quench\n", + "simulations enabled by ML models further reveal intriguing freezing dynamics\n", + "and glassy stripe states consisting of skyrmions and bi-merons. Our work\n", + "highlights the utility of ML force-field approach to dynamical modeling of\n", + "complex spin orders in itinerant electron magnets.\n", "\n", - "Title: Distributed Mixture-of-Agents for Edge Inference with Large Language Models\n", - "Authors: Purbesh Mitra, Priyanka Kaswan, Sennur Ulukus\n", - "Summary: Mixture-of-Agents (MoA) has recently been proposed as a method to enhance\n", - "performance of large language models (LLMs), enabling multiple individual LLMs\n", - "to work together for collaborative inference. This collaborative approach\n", - "results in improved responses to user prompts compared to relying on a single\n", - "LLM. 
In this paper, we consider such an MoA architecture in a distributed\n", - "setting, where LLMs operate on individual edge devices, each uniquely\n", - "associated with a user and equipped with its own distributed computing power.\n", - "These devices exchange information using decentralized gossip algorithms,\n", - "allowing different device nodes to talk without the supervision of a\n", - "centralized server. In the considered setup, different users have their own LLM\n", - "models to address user prompts. Additionally, the devices gossip either their\n", - "own user-specific prompts or augmented prompts to generate more refined answers\n", - "to certain queries. User prompts are temporarily stored in the device queues\n", - "when their corresponding LLMs are busy. Given the memory limitations of edge\n", - "devices, it is crucial to ensure that the average queue sizes in the system\n", - "remain bounded. In this paper, we address this by theoretically calculating the\n", - "queuing stability conditions for the device queues under reasonable\n", - "assumptions, which we validate experimentally as well. Further, we demonstrate\n", - "through experiments, leveraging open-source LLMs for the implementation of\n", - "distributed MoA, that certain MoA configurations produce higher-quality\n", - "responses compared to others, as evaluated on AlpacaEval 2.0 benchmark. The\n", - "implementation is available at:\n", - "https://github.com/purbeshmitra/distributed_moa.\n", + "Title: Meta-Learning for Physically-Constrained Neural System Identification\n", + "Authors: Ankush Chakrabarty, Gordon Wichern, Vedang M. Deshpande, Abraham P. Vinod, Karl Berntorp, Christopher R. Laughman\n", + "Summary: We present a gradient-based meta-learning framework for rapid adaptation of\n", + "neural state-space models (NSSMs) for black-box system identification. When\n", + "applicable, we also incorporate domain-specific physical constraints to improve\n", + "the accuracy of the NSSM. 
The major benefit of our approach is that instead of\n", + "relying solely on data from a single target system, our framework utilizes data\n", + "from a diverse set of source systems, enabling learning from limited target\n", + "data, as well as with few online training iterations. Through benchmark\n", + "examples, we demonstrate the potential of our approach, study the effect of\n", + "fine-tuning subnetworks rather than full fine-tuning, and report real-world\n", + "case studies to illustrate the practical application and generalizability of\n", + "the approach to practical problems with physical-constraints. Specifically, we\n", + "show that the meta-learned models result in improved downstream performance in\n", + "model-based state estimation in indoor localization and energy systems.\n", "\n", - "Title: Sparse chaos in cortical circuits\n", - "Authors: Rainer Engelken, Michael Monteforte, Fred Wolf\n", - "Summary: Nerve impulses, the currency of information flow in the brain, are generated\n", - "by an instability of the neuronal membrane potential dynamics. Neuronal\n", - "circuits exhibit collective chaos that appears essential for learning, memory,\n", - "sensory processing, and motor control. However, the factors controlling the\n", - "nature and intensity of collective chaos in neuronal circuits are not well\n", - "understood. Here we use computational ergodic theory to demonstrate that basic\n", - "features of nerve impulse generation profoundly affect collective chaos in\n", - "neuronal circuits. Numerically exact calculations of Lyapunov spectra,\n", - "Kolmogorov-Sinai-entropy, and upper and lower bounds on attractor dimension\n", - "show that changes in nerve impulse generation in individual neurons moderately\n", - "impact information encoding rates but qualitatively transform phase space\n", - "structure. Specifically, we find a drastic reduction in the number of unstable\n", - "manifolds, Kolmogorov-Sinai entropy, and attractor dimension. 
Beyond a critical\n", - "point, marked by the simultaneous breakdown of the diffusion approximation, a\n", - "peak in the largest Lyapunov exponent, and a localization transition of the\n", - "leading covariant Lyapunov vector, networks exhibit sparse chaos: prolonged\n", - "periods of near stable dynamics interrupted by short bursts of intense chaos.\n", - "Analysis of large, more realistically structured networks supports the\n", - "generality of these findings. In cortical circuits, biophysical properties\n", - "appear tuned to this regime of sparse chaos. Our results reveal a close link\n", - "between fundamental aspects of single-neuron biophysics and the collective\n", - "dynamics of cortical circuits, suggesting that nerve impulse generation\n", - "mechanisms are adapted to enhance circuit controllability and information flow.\n", + "Title: Statistical Challenges in Analyzing Migrant Backgrounds Among University Students: a Case Study from Italy\n", + "Authors: Lorenzo Giammei, Laura Terzera, Fulvia Mecatti\n", + "Summary: The methodological issues and statistical complexities of analyzing\n", + "university students with migrant backgrounds is explored, focusing on Italian\n", + "data from the University of Milano-Bicocca. With the increasing size of migrant\n", + "populations and the growth of the second and middle generations, the need has\n", + "risen for deeper knowledge of the various strata of this population, including\n", + "university students with migrant backgrounds. This presents challenges due to\n", + "inconsistent recording in university datasets. By leveraging both\n", + "administrative records and an original targeted survey we propose a methodology\n", + "to fully identify the study population of students with migrant histories, and\n", + "to distinguish relevant subpopulations within it such as second-generation born\n", + "in Italy. 
Traditional logistic regression and machine learning random forest\n", + "models are used and compared to predict migrant status. The primary\n", + "contribution lies in creating an expanded administrative dataset enriched with\n", + "indicators of students' migrant backgrounds and status. The expanded dataset\n", + "provides a critical foundation for analyzing the characteristics of students\n", + "with migration histories across all variables routinely registered in the\n", + "administrative data set. Additionally, findings highlight the presence of\n", + "selection bias in the targeted survey data, underscoring the need of further\n", + "research.\n", "\n" ] } @@ -258,7 +238,7 @@ "text": [ "INFO:floki.document.fetcher.arxiv:Searching for query: all:(agents AND cybersecurity)\n", "INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=all%3A%28agents+AND+cybersecurity%29&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100\n", - "INFO:arxiv:Got first page: 93 of 93 total results\n", + "INFO:arxiv:Got first page: 94 of 94 total results\n", "INFO:floki.document.fetcher.arxiv:Found 10 results for query: all:(agents AND cybersecurity)\n" ] }, @@ -266,6 +246,34 @@ "name": "stdout", "output_type": "stream", "text": [ + "Title: What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics\n", + "Authors: Lynnette Hui Xian Ng, Kathleen M. Carley\n", + "Summary: Chatter on social media is 20% bots and 80% humans. Chatter by bots and\n", + "humans is consistently different: bots tend to use linguistic cues that can be\n", + "easily automated while humans use cues that require dialogue understanding.\n", + "Bots use words that match the identities they choose to present, while humans\n", + "may send messages that are not related to the identities they present. 
Bots and\n", + "humans differ in their communication structure: sampled bots have a star\n", + "interaction structure, while sampled humans have a hierarchical structure.\n", + "These conclusions are based on a large-scale analysis of social media tweets\n", + "across ~200mil users across 7 events. Social media bots took the world by storm\n", + "when social-cybersecurity researchers realized that social media users not only\n", + "consisted of humans but also of artificial agents called bots. These bots wreck\n", + "havoc online by spreading disinformation and manipulating narratives. Most\n", + "research on bots are based on special-purposed definitions, mostly predicated\n", + "on the event studied. This article first begins by asking, \"What is a bot?\",\n", + "and we study the underlying principles of how bots are different from humans.\n", + "We develop a first-principle definition of a social media bot. With this\n", + "definition as a premise, we systematically compare characteristics between bots\n", + "and humans across global events, and reflect on how the software-programmed bot\n", + "is an Artificial Intelligent algorithm, and its potential for evolution as\n", + "technology advances. Based on our results, we provide recommendations for the\n", + "use and regulation of bots. 
Finally, we discuss open challenges and future\n", + "directions: Detect, to systematically identify these automated and potentially\n", + "evolving bots; Differentiate, to evaluate the goodness of the bot in terms of\n", + "their content postings and relationship interactions; Disrupt, to moderate the\n", + "impact of malicious bots.\n", + "\n", "Title: SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity\n", "Authors: Pengfei Jing, Mengyun Tang, Xiaorong Shi, Xing Zheng, Sen Nie, Shi Wu, Yong Yang, Xiapu Luo\n", "Summary: Evaluating Large Language Models (LLMs) is crucial for understanding their\n", @@ -283,8 +291,8 @@ "high-quality data from open sources and organizing a Cybersecurity Question\n", "Design Contest, resulting in 44,823 MCQs and 3,087 SAQs. Particularly, we used\n", "the powerful while cost-effective LLMs to (1). label the data and (2).\n", - "constructing a grading agent for automatic evaluation of SAQs.Benchmarking\n", - "results on 13 SOTA LLMs demonstrate the usability of SecBench, which is\n", + "constructing a grading agent for automatic evaluation of SAQs. Benchmarking\n", + "results on 16 SOTA LLMs demonstrate the usability of SecBench, which is\n", "arguably the largest and most comprehensive benchmark dataset for LLMs in\n", "cybersecurity. 
More information about SecBench can be found at our website, and\n", "the dataset can be accessed via the artifact link.\n", @@ -443,22 +451,6 @@ "Specifically, we characterize the types of attackers and defenders in the sense\n", "of Bayesian games and, using reinforcement learning, derive empirical findings\n", "about how to best train agents that defend against multiple types of attackers.\n", - "\n", - "Title: Multi-Agent Collaboration in Incident Response with Large Language Models\n", - "Authors: Zefang Liu\n", - "Summary: Incident response (IR) is a critical aspect of cybersecurity, requiring rapid\n", - "decision-making and coordinated efforts to address cyberattacks effectively.\n", - "Leveraging large language models (LLMs) as intelligent agents offers a novel\n", - "approach to enhancing collaboration and efficiency in IR scenarios. This paper\n", - "explores the application of LLM-based multi-agent collaboration using the\n", - "Backdoors & Breaches framework, a tabletop game designed for cybersecurity\n", - "training. We simulate real-world IR dynamics through various team structures,\n", - "including centralized, decentralized, and hybrid configurations. By analyzing\n", - "agent interactions and performance across these setups, we provide insights\n", - "into optimizing multi-agent collaboration for incident response. 
Our findings\n", - "highlight the potential of LLMs to enhance decision-making, improve\n", - "adaptability, and streamline IR processes, paving the way for more effective\n", - "and coordinated responses to cyber threats.\n", "\n" ] } @@ -490,7 +482,7 @@ "text": [ "INFO:floki.document.fetcher.arxiv:Searching for query: all:(quantum NOT computing)\n", "INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=all%3A%28quantum+NOT+computing%29&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100\n", - "INFO:arxiv:Got first page: 100 of 355744 total results\n", + "INFO:arxiv:Got first page: 100 of 356279 total results\n", "INFO:floki.document.fetcher.arxiv:Found 10 results for query: all:(quantum NOT computing)\n" ] }, @@ -498,198 +490,186 @@ "name": "stdout", "output_type": "stream", "text": [ - "Title: Holographic observers for time-band algebras\n", - "Authors: Kristan Jensen, Suvrat Raju, Antony J. Speranza\n", - "Summary: We study the algebra of observables in a time band on the boundary of anti-de\n", - "Sitter space in a theory of quantum gravity. Strictly speaking this algebra\n", - "does not have a commutant because products of operators within the time band\n", - "give rise to operators outside the time band. However, we show that in a state\n", - "where the bulk contains a macroscopic observer, it is possible to define a\n", - "coarse-grained version of this algebra with a non-trivial commutant, and a\n", - "resolution limited by the observer's characteristics. This algebra acts on a\n", - "little Hilbert space that describes excitations about the observer's state and\n", - "time-translated versions of this state. Our construction requires a choice of\n", - "dressing that determines how elements of the algebra transform under the\n", - "Hamiltonian. 
At leading order in gravitational perturbation theory, and with a\n", - "specific choice of dressing, our construction reduces to the modular\n", - "crossed-product described previously in the literature. We also prove a theorem\n", - "showing that this is the only crossed product of a type III$_1$ algebra\n", - "resulting in an algebra with a trace. This trace can be used to define entropy\n", - "differences between states in the little Hilbert space that are insensitive to\n", - "the properties of the observer. We discuss some technical challenges in\n", - "extending this construction to higher orders in perturbation theory. Lastly, we\n", - "review the construction of interior operators in the eternal black hole and\n", - "show that they can be written as elements of a crossed product algebra.\n", + "Title: Algebraic solutions for $SU(2)\\otimes SU(2)$ Hamiltonian eigensystems: generic statistical ensembles and a mesoscopic system application\n", + "Authors: Alex E. Bernardini, Roldao da Rocha\n", + "Summary: Solutions of generic $SU(2)\\otimes SU(2)$ Hamiltonian eigensystems are\n", + "obtained through systematic manipulations of quartic polynomial equations. An\n", + "{\\em ansatz} for constructing separable and entangled eigenstate basis,\n", + "depending on the quartic equation coefficients, is proposed. Besides the\n", + "quantum concurrence for pure entangled states, the associated thermodynamic\n", + "statistical ensembles, their partition function, quantum purity and quantum\n", + "concurrence are shown to be straightforwardly obtained. 
Results are specialized\n", + "to a $SU(2)\\otimes SU(2)$ structure emulated by lattice-layer degrees of\n", + "freedom of the Bernal stacked graphene, in a context that can be extended to\n", + "several mesoscopic scale systems for which the onset from $SU(2)\\otimes SU(2)$\n", + "Hamiltonians has been assumed.\n", "\n", - "Title: Enhanced Two-Way Teleportation of Entangled States with Six-Qubit Cluster State\n", - "Authors: Vedhanayagi R, Soubhik De, Basherrudin Mahmud Ahmed A, Alok Sharan\n", - "Summary: This work presents a two-way teleportation protocol for the transfer of an\n", - "unknown two-qubit quantum state between two parties Alice and Bob, utilizing a\n", - "six-qubit cluster state. This bidirectional exchange is achieved by performing\n", - "Bell measurements on the qubit pairs of Alice and Bob, ensuring the successful\n", - "teleportation of the quantum state for both parties. We demonstrate the\n", - "proposed protocol by designing a teleportation circuit that incorporates the\n", - "necessary quantum gates. The fidelity of the teleportation process is evaluated\n", - "through simulations, confirming the accuracy and reliability of the proposed\n", - "scheme. The protocol restores teleported states without requiring CNOT\n", - "operations or auxiliary qubits, offering a significant advantage in resource\n", - "efficiency(utilization). 
A comparative analysis of the intrinsic efficiency\n", - "with previous approaches establishes that the proposed protocol brings forth an\n", - "efficient approach for achieving two-way quantum teleportation.\n", + "Title: Measuring Non-Gaussian Magic in Fermions: Convolution, Entropy, and the Violation of Wick's Theorem and the Matchgate Identity\n", + "Authors: Luke Coffman, Graeme Smith, Xun Gao\n", + "Summary: Classically hard to simulate quantum states, or \"magic states\", are\n", + "prerequisites to quantum advantage, highlighting an apparent separation between\n", + "classically and quantumly tractable problems. Classically simulable states such\n", + "as Clifford circuits on stabilizer states, free bosonic states, free fermions,\n", + "and matchgate circuits are all in some sense Gaussian. While free bosons and\n", + "fermions arise from quadratic Hamiltonians, recent works have demonstrated that\n", + "bosonic and qudit systems converge to Gaussians and stabilizers under\n", + "convolution. In this work, we similarly identify convolution for fermions and\n", + "find efficient measures of non-Gaussian magic in pure fermionic states. We\n", + "demonstrate that three natural notions for the Gaussification of a state, (1)\n", + "the Gaussian state with the same covariance matrix, (2) the fixed point of\n", + "convolution, and (3) the closest Gaussian in relative entropy, coincide by\n", + "proving a central limit theorem for fermionic systems. We then utilize the\n", + "violation of Wick's theorem and the matchgate identity to quantify non-Gaussian\n", + "magic in addition to a SWAP test.\n", "\n", - "Title: Causality and Stability from Acoustic Geometry\n", - "Authors: Ignacy Sawicki, Georg Trenkler, Alexander Vikman\n", - "Summary: Scalar-tensor theories with derivative interactions form backgrounds which\n", - "spontaneously break Lorentz invariance. We investigate the dynamics of free\n", - "scalar perturbations on general anisotropic backgrounds. 
We demonstrate that\n", - "the phonons move on null geodesics of an acoustic spacetime described by its\n", - "own metric and own connection featuring nonmetricity with respect to the usual\n", - "spacetime metric. We give distinct physical interpretations to the acoustic\n", - "metric and its inverse. The first defines rays and their phase velocities. The\n", - "latter defines momenta and the dispersion relation. We classify possible\n", - "acoustic geometries and provide a physical interpretation for them.\n", - " We discuss the phonon properties that moving observers, inequivalent owing to\n", - "the breaking of Lorentz invariance, would measure. Ghosts and true gradient\n", - "instabilities are to be read off from invariant properties of the acoustic\n", - "metric - its signature and determinant. However, the choice of the observer's\n", - "frame can cause some confusion and paradoxes, including apparent instabilities.\n", - "For instance, complex phonon energies can appear entirely due to the\n", - "ill-posedness of the Cauchy problem in the frame chosen. On the other hand,\n", - "unbounded negative phonon energies can appear, without ghosts or gradient\n", - "instabilities, for observers moving supersonically, when phonon Cherenkov\n", - "radiation can be emitted.\n", - " The action for phonons also gives an acoustically covariantly conserved\n", - "energy-momentum tensor (EMT) which is, however, not conserved in the usual\n", - "spacetime. Nonetheless, in the presence of an acoustic timelike Killing vector,\n", - "the acoustic Hamiltonian functional is a conserved charge in both the acoustic\n", - "and in the usual spacetimes, and even has the same value in both. 
Thus, the\n", - "acoustic Hamiltonian can be used to bound the motion of phonons interacting\n", - "with other species living in the usual spacetime.\n", + "Title: Randomized benchmarking with non-Markovian noise and realistic finite-time gates\n", + "Authors: Antoine Brillant, Peter Groszkowski, Alireza Seif, Jens Koch, Aashish Clerk\n", + "Summary: We analyze the impact of non-Markovian classical noise on single-qubit\n", + "randomized benchmarking experiments, in a manner that explicitly models the\n", + "realization of each gate via realistic finite-duration pulses. Our new\n", + "framework exploits the random nature of each gate sequence to derive\n", + "expressions for the full survival probability decay curve which are\n", + "non-perturbative in the noise strength. In the presence of non-Markovian noise,\n", + "our approach shows that the decay curve can exhibit a strong dependence on the\n", + "implementation method, with regimes of both exponential and power law decays.\n", + "We discuss how these effects can complicate the interpretation of a\n", + "randomized-benchmarking experiment, but also how to leverage them to probe\n", + "non-Markovianty.\n", "\n", - "Title: Junction conditions for higher order gravity theories from a Gibbons-Hawking-York boundary term\n", - "Authors: Marcos A. Ramirez, Cristián Martínez\n", - "Summary: In this work we study the problem of generalizing the Gibbons-Hawking-York\n", - "boundary terms for general quadratic theories of gravity and develop a new\n", - "method to obtain them. From these terms we derive the junction conditions for a\n", - "subset of this family of theories that includes Gauss-Bonnet (GB) gravity. 
We\n", - "re-obtain the well-known results for GB theory, generalize them to other\n", - "quadratic theories and compare the resulting junction conditions with the ones\n", - "already derived in the literature using other methods.\n", + "Title: Supercharging Single-Atom Traps by Collisional Blockade\n", + "Authors: Mark IJspeert, Naomi Holland, Benjamin Yuen, Axel Kuhn\n", + "Summary: Reconfigurable arrays of trapped single atoms are an excellent platform for\n", + "the simulation of many-body physics and the realisation of high-fidelity\n", + "quantum gates. The confinement of atoms is often achieved with focussed laser\n", + "beams acting as optical dipole-force traps that allow for both static and\n", + "dynamic positioning of atoms. In these traps, light-assisted collisions --\n", + "enhancing the two-atom loss rate -- ensure that single atom occupation of traps\n", + "can be realised. However, the time-averaged probability of trapping a single\n", + "atom is limited to $0.5$ when loading directly from a surrounding cloud of\n", + "laser-cooled atoms, preventing deterministic filling of large arrays. In this\n", + "work, we demonstrate that increasing the depth of a static, optical dipole trap\n", + "enables the transition from fast loading on a timescale of $2.1\\,$s to an\n", + "extended trap lifetime of $7.9\\,$s. This method demonstrates an achievable\n", + "filling ratio of $(79\\pm2)\\,\\%$ without the need of rearranging atoms to fill\n", + "vacant traps.\n", "\n", - "Title: Cavity-QED Simulation of a Maser beyond the Mean-Field Approximation\n", - "Authors: Xinpeng Shu, Yining Jiang, Hao Wu, Mark Oxborrow\n", - "Summary: We here introduce a method for simulating, quantum mechanically, the dynamics\n", - "of a maser where the strength of the magnetic field of the microwave mode being\n", - "amplified by stimulated emission varies over the volume of the maser's\n", - "spatially extended gain medium. 
This is very often the case in real systems.\n", - "Our method generalizes the well-known Tavis-Cummings (T-C) model of cavity\n", - "quantum electrodynamics (QED) to encompass quantum emitters whose coupling\n", - "strengths to the maser's amplified mode vary over a distribution that can be\n", - "accurately determined using an electromagnetic-field solver applied to the\n", - "maser cavity's geometry and composition. We then solve our generalized T-C\n", - "model to second order in cumulant expansion using publicly available\n", - "Python-based software. We apply our methodology to a specific, experimentally\n", - "measured maser based on an optically pumped crystal of pentacene-doped\n", - "para-terphenyl. We demonstrate that certain distinct quantum-mechanical\n", - "features exhibited by this maser's dynamics, most notably the observation of\n", - "Rabi-like flopping associated with the generation of spin-photon Dicke states,\n", - "can be accurately reproduced using our numerically solved model. The equivalent\n", - "simpler model, that invokes the mean-field approximation, fails to do so. By\n", - "constructing then solving for artificial (perfectly Gaussian) distributions, we\n", - "go on to explore how the performance of this type of maser is affected by the\n", - "spread in spin-photon coupling strengths. Our methodology thereby enables the\n", - "maser's anatomy to be more rationally engineered.\n", + "Title: Multilingual Performance of a Multimodal Artificial Intelligence System on Multisubject Physics Concept Inventories\n", + "Authors: Gerd Kortemeyer, Marina Babayeva, Giulia Polverini, Bor Gregorcic, Ralf Widenhorn\n", + "Summary: We investigate the multilingual and multimodal performance of a large\n", + "language model-based artificial intelligence (AI) system, GPT-4o, on a diverse\n", + "set of physics concept inventories spanning multiple languages and subject\n", + "areas. 
The inventories taken from the PhysPort website cover the classical\n", + "physics topics of mechanics, electromagnetism, optics, and thermodynamics as\n", + "well as relativity, quantum mechanics, astronomy, mathematics, and laboratory\n", + "skills. Unlike previous text-only studies, we uploaded the inventories as\n", + "images mirroring what a student would see on paper, assessing the system's\n", + "multimodal functionality. The AI is prompted in English and autonomously\n", + "chooses the language of its response - either remaining in the nominal language\n", + "of the test, switching entirely to English, or mixing languages - revealing\n", + "adaptive behavior dependent on linguistic complexity and data availability. Our\n", + "results indicate some variation in performance across subject areas, with\n", + "laboratory skills standing out as the area of poorest performance. Furthermore,\n", + "the AI's performance on questions that require visual interpretation of images\n", + "is worse than on purely text-based questions. Questions that are difficult for\n", + "the AI tend to be that way invariably of the inventory language. We also find\n", + "large variations in performance across languages, with some appearing to\n", + "benefit substantially from language switching, a phenomenon similar to\n", + "code-switching ofhuman speakers. Overall, comparing the obtained AI results to\n", + "the existing literature, we find that the AI system outperforms average\n", + "undergraduate students post-instruction in all subject areas but laboratory\n", + "skills.\n", "\n", - "Title: Quantum uncertainty in the area of a black hole\n", - "Authors: Maulik Parikh, Jude Pereira\n", - "Summary: Quantum fluctuations of the spacetime metric induce an uncertainty in the\n", - "horizon area of a black hole. Working in linearized quantum gravity, we derive\n", - "the variance in the area of a four-dimensional Schwarzschild black hole from\n", - "the renormalized graviton propagator. 
We find that the standard deviation of\n", - "the horizon area scales as the product of the Schwarzschild radius and the\n", - "Planck length. For macroscopic black holes, the quantum uncertainty is\n", - "therefore enormous in Planck units.\n", + "Title: Hidden entanglement in twin beams generated through optical parametric amplification in hot alkali atoms\n", + "Authors: R. L. Rincón Celis, G. Nirala, A. Montaña Guerrero, T. L. Meireles, M. Martinelli, A. M. Marino, H. M. Florez\n", + "Summary: Proper characterization of quantum correlations in a multimode optical state\n", + "is critical for applications in quantum information science; however, the most\n", + "common entanglement measurements can lead to an incomplete state\n", + "reconstruction. This is the case for the ubiquitous spectral measurement of\n", + "field quadratures for which a full characterization of the quantum correlations\n", + "between optical beams is not possible. We demonstrate this effect in twin beams\n", + "generated through parametric amplification by four-wave mixing in hot rubidium\n", + "vapor, showing the role of a frequency dependent gain response. We implement a\n", + "resonator-based measurement that reveals entanglement between beams that is\n", + "otherwise hidden by usual spectral measurements. Additionally, this system\n", + "shows how the phase shifts between the carrier and the sidebands on the\n", + "involved fields affect the observation of entanglement for different\n", + "entanglement witnesses, demonstrating the relevance of making a complete state\n", + "tomography.\n", "\n", - "Title: Accidental Peccei-Quinn Symmetry From Gauged U(1) and a High Quality Axion\n", - "Authors: K. S. Babu, Bhaskar Dutta, Rabindra N. Mohapatra\n", - "Summary: We construct explicit models that solve the axion quality problem originating\n", - "from quantum gravitational effects. 
The general strategy we employ is to\n", - "supplement the Standard Model and its grand unified extensions by an\n", - "anomaly-free axial $U(1)_a$ symmetry that is gauged. We show that for several\n", - "choices of the gauge quantum numbers of the fermions, this setup leads to an\n", - "accidental $U(1)$ symmetry with a QCD anomaly which is identified as the\n", - "Peccei-Quinn (PQ) symmetry that solves the strong CP problem. The $U(1)_a$\n", - "gauge symmetry controls the amount of explicit PQ symmetry violation induced by\n", - "quantum gravity, resulting in a high quality axion. We present two classes of\n", - "models employing this strategy. In the first class (models I and II), the axial\n", - "$U(1)_a$ gauge symmetry acts on vector-like quarks leading to an accidental\n", - "KSVZ-type axion. The second class (model III) is based on $SO(10)$ grand\n", - "unified theory extended by a gauged $U(1)_a$ symmetry that leads to a hybrid\n", - "KSVZ--DFSZ type axion. The couplings of the axion to the electron and the\n", - "nucleon are found to be distinct in this class of hybrid models from those in\n", - "the KSVZ and DFSZ models, which can be used to test these models.\n", - "Interestingly, all models presented here have domain wall number of one, which\n", - "is free of cosmological problems that typically arise in axion models.\n", + "Title: Quantum networks using rare-earth ions\n", + "Authors: Wolfgang Tittel, Mikael Afzelius, Adam Kinos, Lars Rippe, Andreas Walther\n", + "Summary: We review concepts and recent work related to creating light-matter\n", + "interfaces for future quantum networks based on rare-earth ion-doped crystals.\n", + "More precisely, we explore their unique suitability for creating photon\n", + "sources, optical quantum memories for light, and qubits that allow quantum\n", + "information processing. 
In addition, we review the state-of-the-art of\n", + "elementary quantum repeater links, and provide suggestions for future research.\n", "\n", - "Title: Particle-Soliton Degeneracy in 2D Quantum Chromodynamics\n", - "Authors: Clay Cordova, Diego García-Sepúlveda, Nicholas Holfester\n", - "Summary: Quantum chromodynamics in two spacetime dimensions admits a finite\n", - "non-invertible symmetry described mathematically by a fusion category. This\n", - "symmetry is spontaneously broken at long distances, leading to distinct vacua.\n", - "When the theory has a mass gap, the spectrum is therefore characterized by\n", - "particle excitations above a single vacuum and soliton sectors interpolating\n", - "between vacua. We use anyon condensation and the representation theory of\n", - "fusion categories to obtain exact results about this spectrum, exhibiting the\n", - "allowed multiplets. Often, particles and solitons are in the same\n", - "representation and therefore must have equal masses. Furthermore, the fusion\n", - "category symmetry frequently implies the existence of certain stable states in\n", - "the spectrum. The resulting degeneracies are encoded in quiver diagrams where\n", - "nodes are vacua and arrows are excited states.\n", + "Title: Pushing limits: Probing new gravity using a satellite constellation\n", + "Authors: Viktor T. Toth\n", + "Summary: Building upon earlier work, we explore the limits of using a configuration of\n", + "satellites to measure the trace of the gravitational gradient tensor using\n", + "intersatellite laser ranging and timing observables without relying on\n", + "high-precision external observables such as deep space radio navigation or\n", + "astrometry with unrealistic accuracy. 
A refined model, calculated with extended\n", + "numerical precision, confirms that exceptional sensitivity is possible, placing\n", + "within reach observational tests of certain modified gravity theories (e.g.,\n", + "Yukawa terms, galileons) using heliocentric orbits in the vicinity of the\n", + "Earth. The sensitivity of the experiment improves at larger heliocentric\n", + "distances. A constellation placed at 30 astronomical units, still well within\n", + "the domain of feasibility using available propulsion and deep space\n", + "communication technologies, may approach sensitivities that are sufficient to\n", + "detect not just the gravitational contribution of the interplanetary medium but\n", + "perhaps even cosmological dark matter and dark energy constituents.\n", "\n", - "Title: Topological dark energy from black-hole formations and mergers through the gravity-thermodynamics approach\n", - "Authors: Stylianos A. Tsilioukas, Nicholas Petropoulos, Emmanuel N. Saridakis\n", - "Summary: We apply the gravity-thermodynamics approach in the case of\n", - "Einstein-Gauss-Bonnet theory, and its corresponding Wald-Gauss-Bonnet entropy,\n", - "which due to the Chern-Gauss-Bonnet theorem it is related to the Euler\n", - "characteristic of the Universe topology. However, we consider the realistic\n", - "scenario where we have the formation and merger of black holes that lead to\n", - "topology changes, which induce entropy changes in the Universe horizon. We\n", - "extract the modified Friedmann equations and we obtain an effective dark energy\n", - "sector of topological origin. 
We estimate the black-hole formation and merger\n", - "rates starting from the observed star formation rate per redshift, which is\n", - "parametrized very efficiently by the Madau-Dickinson form, and finally we\n", - "result to a dark-energy energy density that depends on the cosmic star\n", - "formation rate density, on the fraction $f_{\\text{BH}}$ of stars forming black\n", - "holes, on the fraction of black holes $f_\\text{merge}$ that eventually merge,\n", - "on the fraction $ f_{\\text{bin}}$ of massive stars that are in binaries, on the\n", - "average mass of progenitor stars that will evolve to form black holes $ \\langle\n", - "m_{\\text{prog}} \\rangle $, as well as on the Gauss-Bonnet coupling constant. We\n", - "investigate in detail the cosmological evolution, obtaining the usual thermal\n", - "history. Concerning the dark-energy equation-of-state parameter, we show that\n", - "at intermediate redshifts it exhibits phantom-like or quintessence-like\n", - "behavior according to the sign of the Gauss-Bonnet coupling, while at early and\n", - "late times it tends to the cosmological constant value. Finally, we study the\n", - "effect of the other model parameters, showing that for the whole allowed\n", - "observationally estimated ranges, the topological dark-energy equation-of-state\n", - "parameter remains within its observational bounds.\n", + "Title: Theory for the Rydberg states of helium: Comparison with experiment for the $1s24p\\;^1P_1$ state ($n=24$)\n", + "Authors: Aaron T. Bondy, G. W. F. Drake, Cody McLeod, Evan M. R. Petrimoulx, Xiao-Qiu Qi, Zhen-Xiang Zhong\n", + "Summary: Recent measurements of the ionization energies of the Rydberg $^1P$ states of\n", + "helium for principal quantum number $n = 24$ and higher present a new challenge\n", + "to theoretical atomic physics. 
A long-standing obstacle to high precision\n", + "atomic theory for three-body systems is a rapid loss of accuracy for\n", + "variational calculations with increasing principal quantum number $n$. We show\n", + "that this problem can be overcome with the use of a ``triple\" basis set in\n", + "Hylleraas coordinates. Nonrelativistic energies accurate to 23 significant\n", + "figures are obtained with basis sets of relatively modest size (6744 terms).\n", + "Relativistic and quantum electrodynamic effects are calculated, including an\n", + "estimate of terms of order $m\\alpha^6$ from a $1/n^3$ extrapolation, resulting\n", + "in an estimated accuracy of $\\pm$1 kHz. The calculated ionization energy of\n", + "5704 980.348(1) MHz is in excellent agreement with the experimental value 5704\n", + "980.312(95) MHz. These results establish the ionization energy of the\n", + "$1s24p\\;^1P_1$ state as an absolute point of reference for transitions to\n", + "lower-lying states, and they confirm an $11\\sigma$ disagreement between theory\n", + "and experiment in the triplet spectrum of helium. Results are also given for\n", + "the $1s24p\\;^3P_J$ states in agreement with a recent experiment on the triplet\n", + "Rydberg series, thereby confirming a discrepancy of of $0.468 \\pm 0.055$ MHz\n", + "for the ionization energy of the $1s2s\\;^3S_1$ state.\n", "\n", - "Title: Gravitational EFT for dissipative open systems\n", - "Authors: Pak Hang Chris Lau, Kanji Nishii, Toshifumi Noumi\n", - "Summary: We elaborate on the effective field theory (EFT) construction for dissipative\n", - "open systems coupled to dynamical gravity, in light of recent developments on\n", - "the EFT of dissipative hydrodynamics (HydroEFT). Our construction is based on\n", - "the Schwinger-Keldysh formalism and its symmetries as well as microscopic\n", - "unitarity. 
A key aspect of dynamical gravity is that gravity couples to all\n", - "degrees of freedom universally, hence the EFT has to take into account the\n", - "energy-momentum tensor of the environment to which the energy escapes from the\n", - "dissipative system of interest. We incorporate this effect by modeling the\n", - "environment based on HydroEFT, assuming validity of the derivative expansion of\n", - "the environment sector. For illustration, we apply our EFT recipe to a\n", - "dissipative scalar field coupled to dynamical gravity that can be used, e.g.,\n", - "for dissipative inflation. In particular we quantify impacts of fluctuations in\n", - "the environment sector on the scalar dynamics. We also apply the same framework\n", - "to dissipative gravity, discussing dissipative gravitational waves and the\n", - "generalized second law of black hole thermodynamics.\n", + "Title: The Spectre of Underdetermination in Modern Cosmology\n", + "Authors: Pedro G. Ferreira, William J. Wolf, James Read\n", + "Summary: The scientific status of physical cosmology has been the subject of\n", + "philosophical debate ever since detailed mathematical models of the Universe\n", + "emerged from Einstein's general theory of relativity. Such debates revolve\n", + "around whether and to what extent cosmology meets established demarcation\n", + "criteria for a discipline to be scientific, as well as determining how to best\n", + "characterize cosmology as a science, given the unique challenges and\n", + "limitations faced by a discipline which aims to study the origin, composition,\n", + "and fate of the Universe itself. The present article revisits, in light of the\n", + "dramatic progress in cosmology in recent decades, an earlier debate held in the\n", + "1950s between Herman Bondi and Gerald Whitrow regarding the scientific status\n", + "of cosmology. 
We analyse cosmology's transition from an emerging science to a\n", + "cornerstone of modern physics, highlighting its empirical successes in\n", + "establishing the $\\Lambda$-Cold Dark Matter ($\\Lambda$CDM) model and in its\n", + "delivery of various successful novel predictions. Despite this remarkable\n", + "scientific success and progress, we argue that modern cosmology faces a further\n", + "profound challenge: the permanent underdetermination of the microphysical\n", + "nature of its exotic energy components: inflation, dark matter, and dark\n", + "energy. Drawing historical parallels with the role of spectroscopy in revealing\n", + "the microphysical nature of atomic physics, we argue that the epistemic\n", + "barriers obstructing us from ascertaining the microphysical nature of these\n", + "exotic energy components are significant, in turn casting doubt upon whether\n", + "cosmology can ever transcend these particular epistemic challenges. We conclude\n", + "by reflecting on the prospects for future breakthroughs and/or non-empirical\n", + "arguments which could decide this issue conclusively.\n", "\n" ] } @@ -756,7 +736,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Filter Papers by Date (e.g., Last 24 Hours)" + "## Filter Papers by Date (e.g., Last 15 Days)" ] }, { @@ -769,15 +749,44 @@ "output_type": "stream", "text": [ "INFO:floki.document.fetcher.arxiv:Searching for query: all:(agents AND cybersecurity)\n", - "INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=all%3A%28agents+AND+cybersecurity%29+AND+submittedDate%3A%5B20241230+TO+20241231%5D&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100\n", - "INFO:arxiv:Got first page: 1 of 1 total results\n", - "INFO:floki.document.fetcher.arxiv:Found 1 results for query: all:(agents AND cybersecurity) AND submittedDate:[20241230 TO 20241231]\n" + "INFO:arxiv:Requesting page (first: True, try: 0): 
https://export.arxiv.org/api/query?search_query=all%3A%28agents+AND+cybersecurity%29+AND+submittedDate%3A%5B20241229+TO+20250113%5D&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100\n", + "INFO:arxiv:Got first page: 2 of 2 total results\n", + "INFO:floki.document.fetcher.arxiv:Found 2 results for query: all:(agents AND cybersecurity) AND submittedDate:[20241229 TO 20250113]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "Title: What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics\n", + "Authors: Lynnette Hui Xian Ng, Kathleen M. Carley\n", + "Published: 2025-01-01\n", + "Summary: Chatter on social media is 20% bots and 80% humans. Chatter by bots and\n", + "humans is consistently different: bots tend to use linguistic cues that can be\n", + "easily automated while humans use cues that require dialogue understanding.\n", + "Bots use words that match the identities they choose to present, while humans\n", + "may send messages that are not related to the identities they present. Bots and\n", + "humans differ in their communication structure: sampled bots have a star\n", + "interaction structure, while sampled humans have a hierarchical structure.\n", + "These conclusions are based on a large-scale analysis of social media tweets\n", + "across ~200mil users across 7 events. Social media bots took the world by storm\n", + "when social-cybersecurity researchers realized that social media users not only\n", + "consisted of humans but also of artificial agents called bots. These bots wreck\n", + "havoc online by spreading disinformation and manipulating narratives. Most\n", + "research on bots are based on special-purposed definitions, mostly predicated\n", + "on the event studied. This article first begins by asking, \"What is a bot?\",\n", + "and we study the underlying principles of how bots are different from humans.\n", + "We develop a first-principle definition of a social media bot. 
With this\n", + "definition as a premise, we systematically compare characteristics between bots\n", + "and humans across global events, and reflect on how the software-programmed bot\n", + "is an Artificial Intelligent algorithm, and its potential for evolution as\n", + "technology advances. Based on our results, we provide recommendations for the\n", + "use and regulation of bots. Finally, we discuss open challenges and future\n", + "directions: Detect, to systematically identify these automated and potentially\n", + "evolving bots; Differentiate, to evaluate the goodness of the bot in terms of\n", + "their content postings and relationship interactions; Disrupt, to moderate the\n", + "impact of malicious bots.\n", + "\n", "Title: SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity\n", "Authors: Pengfei Jing, Mengyun Tang, Xiaorong Shi, Xing Zheng, Sen Nie, Shi Wu, Yong Yang, Xiapu Luo\n", "Published: 2024-12-30\n", @@ -796,8 +805,8 @@ "high-quality data from open sources and organizing a Cybersecurity Question\n", "Design Contest, resulting in 44,823 MCQs and 3,087 SAQs. Particularly, we used\n", "the powerful while cost-effective LLMs to (1). label the data and (2).\n", - "constructing a grading agent for automatic evaluation of SAQs.Benchmarking\n", - "results on 13 SOTA LLMs demonstrate the usability of SecBench, which is\n", + "constructing a grading agent for automatic evaluation of SAQs. Benchmarking\n", + "results on 16 SOTA LLMs demonstrate the usability of SecBench, which is\n", "arguably the largest and most comprehensive benchmark dataset for LLMs in\n", "cybersecurity. 
More information about SecBench can be found at our website, and\n", "the dataset can be accessed via the artifact link.\n", @@ -809,7 +818,7 @@ "from datetime import datetime, timedelta\n", "\n", "# Calculate date 48 hours ago\n", - "last_24_hours = (datetime.now() - timedelta(days=1)).strftime(\"%Y%m%d\")\n", + "last_24_hours = (datetime.now() - timedelta(days=15)).strftime(\"%Y%m%d\")\n", "\n", "# Search for recent papers\n", "recent_results = fetcher.search(\n", @@ -845,21 +854,24 @@ "text": [ "INFO:floki.document.fetcher.arxiv:Searching for query: all:(agents AND cybersecurity)\n", "INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=all%3A%28agents+AND+cybersecurity%29&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100\n", - "INFO:arxiv:Got first page: 93 of 93 total results\n", + "INFO:arxiv:Got first page: 94 of 94 total results\n", "INFO:floki.document.fetcher.arxiv:Found 5 results for query: all:(agents AND cybersecurity)\n", - "INFO:floki.document.fetcher.arxiv:Downloading paper to arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf\n", + "INFO:floki.document.fetcher.arxiv:Downloading paper to arxiv_papers/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf\n", + "INFO:floki.document.fetcher.arxiv:Downloading paper to arxiv_papers/2412.20787v3.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf\n", "INFO:floki.document.fetcher.arxiv:Downloading paper to arxiv_papers/2412.13420v1.BotSim__LLM_Powered_Malicious_Social_Botnet_Simulation.pdf\n", "INFO:floki.document.fetcher.arxiv:Downloading paper to arxiv_papers/2412.15237v1.algoTRIC__Symmetric_and_asymmetric_encryption_algorithms_for_Cryptography____A_comparative_analysis_in_AI_era.pdf\n", - "INFO:floki.document.fetcher.arxiv:Downloading paper to 
arxiv_papers/2412.06512v1.The_Fusion_of_Large_Language_Models_and_Formal_Methods_for_Trustworthy_AI_Agents__A_Roadmap.pdf\n", - "INFO:floki.document.fetcher.arxiv:Downloading paper to arxiv_papers/2412.02875v1.Out_of_Distribution_Detection_for_Neurosymbolic_Autonomous_Cyber_Agents.pdf\n" + "INFO:floki.document.fetcher.arxiv:Downloading paper to arxiv_papers/2412.06512v1.The_Fusion_of_Large_Language_Models_and_Formal_Methods_for_Trustworthy_AI_Agents__A_Roadmap.pdf\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "Downloaded Paper: What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics\n", + "File Path: arxiv_papers/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf\n", + "\n", "Downloaded Paper: SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity\n", - "File Path: arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf\n", + "File Path: arxiv_papers/2412.20787v3.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf\n", "\n", "Downloaded Paper: BotSim: LLM-Powered Malicious Social Botnet Simulation\n", "File Path: arxiv_papers/2412.13420v1.BotSim__LLM_Powered_Malicious_Social_Botnet_Simulation.pdf\n", @@ -869,9 +881,6 @@ "\n", "Downloaded Paper: The Fusion of Large Language Models and Formal Methods for Trustworthy AI Agents: A Roadmap\n", "File Path: arxiv_papers/2412.06512v1.The_Fusion_of_Large_Language_Models_and_Formal_Methods_for_Trustworthy_AI_Agents__A_Roadmap.pdf\n", - "\n", - "Downloaded Paper: Out-of-Distribution Detection for Neurosymbolic Autonomous Cyber Agents\n", - "File Path: arxiv_papers/2412.02875v1.Out_of_Distribution_Detection_for_Neurosymbolic_Autonomous_Cyber_Agents.pdf\n", "\n" ] } @@ -891,6 +900,143 @@ " print(f\"File Path: {paper['file_path']}\\n\")" ] }, + { + "cell_type": "code", + "execution_count": 9, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'entry_id': 'http://arxiv.org/abs/2501.00855v1',\n", + " 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics',\n", + " 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'],\n", + " 'published': '2025-01-01',\n", + " 'updated': '2025-01-01',\n", + " 'primary_category': 'cs.CY',\n", + " 'categories': ['cs.CY', 'cs.AI', 'cs.SI'],\n", + " 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1',\n", + " 'file_path': 'arxiv_papers/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "download_results[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download Top 5 Papers as PDF Files (Include Summary)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:floki.document.fetcher.arxiv:Searching for query: all:(agents AND cybersecurity)\n", + "INFO:arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=all%3A%28agents+AND+cybersecurity%29&id_list=&sortBy=submittedDate&sortOrder=descending&start=0&max_results=100\n", + "INFO:arxiv:Got first page: 94 of 94 total results\n", + "INFO:floki.document.fetcher.arxiv:Found 5 results for query: all:(agents AND cybersecurity)\n", + "INFO:floki.document.fetcher.arxiv:Downloading paper to more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf\n", + "INFO:floki.document.fetcher.arxiv:Downloading paper to more_arxiv/2412.20787v3.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf\n", + "INFO:floki.document.fetcher.arxiv:Downloading paper to 
more_arxiv/2412.13420v1.BotSim__LLM_Powered_Malicious_Social_Botnet_Simulation.pdf\n",
+      "INFO:floki.document.fetcher.arxiv:Downloading paper to more_arxiv/2412.15237v1.algoTRIC__Symmetric_and_asymmetric_encryption_algorithms_for_Cryptography____A_comparative_analysis_in_AI_era.pdf\n",
+      "INFO:floki.document.fetcher.arxiv:Downloading paper to more_arxiv/2412.06512v1.The_Fusion_of_Large_Language_Models_and_Formal_Methods_for_Trustworthy_AI_Agents__A_Roadmap.pdf\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Create a directory for downloaded papers\n",
+    "os.makedirs(\"more_arxiv\", exist_ok=True)\n",
+    "\n",
+    "# Search and download PDFs\n",
+    "download_results = fetcher.search(query=\"all:(agents AND cybersecurity)\", max_results=5, download=True, dirpath=Path(\"more_arxiv\"), include_summary=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'entry_id': 'http://arxiv.org/abs/2501.00855v1',\n",
+       " 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics',\n",
+       " 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'],\n",
+       " 'published': '2025-01-01',\n",
+       " 'updated': '2025-01-01',\n",
+       " 'primary_category': 'cs.CY',\n",
+       " 'categories': ['cs.CY', 'cs.AI', 'cs.SI'],\n",
+       " 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1',\n",
+       " 'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf',\n",
+       " 'summary': 'Chatter on social media is 20% bots and 80% humans. Chatter by bots and\nhumans is consistently different: bots tend to use linguistic cues that can be\neasily automated while humans use cues that require dialogue understanding.\nBots use words that match the identities they choose to present, while humans\nmay send messages that are not related to the identities they present. 
Bots and\\nhumans differ in their communication structure: sampled bots have a star\\ninteraction structure, while sampled humans have a hierarchical structure.\\nThese conclusions are based on a large-scale analysis of social media tweets\\nacross ~200mil users across 7 events. Social media bots took the world by storm\\nwhen social-cybersecurity researchers realized that social media users not only\\nconsisted of humans but also of artificial agents called bots. These bots wreck\\nhavoc online by spreading disinformation and manipulating narratives. Most\\nresearch on bots are based on special-purposed definitions, mostly predicated\\non the event studied. This article first begins by asking, \"What is a bot?\",\\nand we study the underlying principles of how bots are different from humans.\\nWe develop a first-principle definition of a social media bot. With this\\ndefinition as a premise, we systematically compare characteristics between bots\\nand humans across global events, and reflect on how the software-programmed bot\\nis an Artificial Intelligent algorithm, and its potential for evolution as\\ntechnology advances. Based on our results, we provide recommendations for the\\nuse and regulation of bots. Finally, we discuss open challenges and future\\ndirections: Detect, to systematically identify these automated and potentially\\nevolving bots; Differentiate, to evaluate the goodness of the bot in terms of\\ntheir content postings and relationship interactions; Disrupt, to moderate the\\nimpact of malicious bots.'}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "download_results[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Chatter on social media is 20% bots and 80% humans. 
Chatter by bots and\n", + "humans is consistently different: bots tend to use linguistic cues that can be\n", + "easily automated while humans use cues that require dialogue understanding.\n", + "Bots use words that match the identities they choose to present, while humans\n", + "may send messages that are not related to the identities they present. Bots and\n", + "humans differ in their communication structure: sampled bots have a star\n", + "interaction structure, while sampled humans have a hierarchical structure.\n", + "These conclusions are based on a large-scale analysis of social media tweets\n", + "across ~200mil users across 7 events. Social media bots took the world by storm\n", + "when social-cybersecurity researchers realized that social media users not only\n", + "consisted of humans but also of artificial agents called bots. These bots wreck\n", + "havoc online by spreading disinformation and manipulating narratives. Most\n", + "research on bots are based on special-purposed definitions, mostly predicated\n", + "on the event studied. This article first begins by asking, \"What is a bot?\",\n", + "and we study the underlying principles of how bots are different from humans.\n", + "We develop a first-principle definition of a social media bot. With this\n", + "definition as a premise, we systematically compare characteristics between bots\n", + "and humans across global events, and reflect on how the software-programmed bot\n", + "is an Artificial Intelligent algorithm, and its potential for evolution as\n", + "technology advances. Based on our results, we provide recommendations for the\n", + "use and regulation of bots. 
Finally, we discuss open challenges and future\n", + "directions: Detect, to systematically identify these automated and potentially\n", + "evolving bots; Differentiate, to evaluate the goodness of the bot in terms of\n", + "their content postings and relationship interactions; Disrupt, to moderate the\n", + "impact of malicious bots.\n" + ] + } + ], + "source": [ + "print(download_results[0][\"summary\"])" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -919,7 +1065,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -947,7 +1093,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Extracted 83 documents from the PDFs.\n" + "Extracted 107 documents from the PDFs.\n" ] } ], @@ -959,6 +1105,10 @@ "docs_read = []\n", "reader = PyPDFReader()\n", "\n", + "# Remove 'summary' from metadata in download_results\n", + "for paper in download_results:\n", + " paper.pop(\"summary\", None) # Remove the 'summary' key if it exists\n", + "\n", "# Process each downloaded PDF\n", "for paper in download_results:\n", " local_pdf_path = Path(paper[\"file_path\"]) # Ensure the key matches the output\n", @@ -973,30 +1123,30 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(metadata={'file_path': 'arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf', 'page_number': 1, 'total_pages': 11, 'entry_id': 'http://arxiv.org/abs/2412.20787v1', 'title': 'SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity', 'authors': ['Pengfei Jing', 'Mengyun Tang', 'Xiaorong Shi', 'Xing Zheng', 'Sen Nie', 'Shi Wu', 'Yong Yang', 'Xiapu Luo'], 'published': '2024-12-30', 'updated': '2024-12-30', 'primary_category': 'cs.CR', 'categories': ['cs.CR', 'cs.AI'], 'pdf_url': 'http://arxiv.org/pdf/2412.20787v1'}, text='SecBench: A Comprehensive 
Multi-Dimensional\\nBenchmarking Dataset for LLMs in Cybersecurity\\nPENGFEI JING, The Hong Kong Polytechnic University, Tencent Security Keen Lab, China\\nMENGYUN TANG, Tencent Zhuque Lab, China\\nXIAORONG SHI, Tencent Zhuque Lab, China\\nXING ZHENG, Tencent Zhuque Lab, China\\nSEN NIE, Tencent Security Keen Lab, China\\nSHI WU, Tencent Security Keen Lab, China\\nYONG YANG,Tencent Security Platform and Department, China\\nXIAPU LUO, The Hong Kong Polytechnic University, China\\nEvaluating Large Language Models (LLMs) is crucial for understanding their capabilities and limitations across\\nvarious applications, including natural language processing and code generation. Existing benchmarks like\\nMMLU, C-Eval, and HumanEval assess general LLM performance but lack focus on specific expert domains\\nsuch as cybersecurity. Previous attempts to create cybersecurity datasets have faced limitations, including\\ninsufficient data volume and a reliance on multiple-choice questions (MCQs). To address these gaps, we\\npropose SecBench, a multi-dimensional benchmarking dataset designed to evaluate LLMs in the cybersecurity\\ndomain. SecBench includes questions in various formats (MCQs and short-answer questions (SAQs)), at\\ndifferent capability levels (Knowledge Retention and Logical Reasoning), in multiple languages (Chinese and\\nEnglish), and across various sub-domains. The dataset was constructed by collecting high-quality data from\\nopen sources and organizing a Cybersecurity Question Design Contest, resulting in 44,823 MCQs and 3,087\\nSAQs. Particularly, we used the powerful while cost-effective LLMs to (1). label the data and (2). constructing\\na grading agent for automatic evaluation of SAQs. Benchmarking results on 13 SOTA LLMs demonstrate the\\nusability of SecBench, which is arguably the largest and most comprehensive benchmark dataset for LLMs in\\ncybersecurity. 
More information about SecBench can be found at our website [13], and the dataset can be\\naccessed via the artifact link [12].\\n1 Introduction\\nEvaluating Large Language Models (LLMs) is essential for understanding their capabilities and\\nlimitations, as these models play a significant role in various applications, from natural language\\nprocessing to code generation. The importance of evaluating LLMs lies in ensuring their reliable\\nand effective performance across diverse tasks while identifying areas for improvement. Many\\nbenchmarks have been developed to assess different aspects of LLM performance, such as the\\nMMLU benchmark for general knowledge and reasoning [ 20], C-Eval for the Chinese context\\n[16], and HumanEval for code generation and completion [ 17]. These benchmarks collectively\\nprovide a comprehensive framework for evaluating the multifaceted capabilities of LLMs. However,\\nwhile these benchmarks focus on general capabilities, it is also crucial to evaluate LLMs in specific\\nexpert domains, such as cybersecurity. Previous studies have attempted to establish datasets for this\\npurpose [15, 18, 19, 21], but they face limitations, including insufficient evaluation data volume and\\na predominant use of multiple-choice questions (MCQs). 
A more challenging task, the short-answer\\nquestion (SAQ), which requires the model to generate its own answer, has not been explored in\\nthese studies.\\nAuthors’ Contact Information: Pengfei Jing, The Hong Kong Polytechnic University, Tencent Security Keen Lab, China;\\nMengyun Tang, Tencent Zhuque Lab, China; Xiaorong Shi, Tencent Zhuque Lab, China; Xing Zheng, Tencent Zhuque Lab,\\nChina; Sen Nie, Tencent Security Keen Lab, China; Shi Wu, Tencent Security Keen Lab, China; Yong Yang, Tencent Security\\nPlatform and Department, China; Xiapu Luo, The Hong Kong Polytechnic University, China.\\narXiv:2412.20787v1 [cs.CR] 30 Dec 2024'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf', 'page_number': 2, 'total_pages': 11, 'entry_id': 'http://arxiv.org/abs/2412.20787v1', 'title': 'SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity', 'authors': ['Pengfei Jing', 'Mengyun Tang', 'Xiaorong Shi', 'Xing Zheng', 'Sen Nie', 'Shi Wu', 'Yong Yang', 'Xiapu Luo'], 'published': '2024-12-30', 'updated': '2024-12-30', 'primary_category': 'cs.CR', 'categories': ['cs.CR', 'cs.AI'], 'pdf_url': 'http://arxiv.org/pdf/2412.20787v1'}, text='2 Pengfei Jing, Mengyun Tang, Xiaorong Shi, Xing Zheng, Sen Nie, Shi Wu, Yong Yang, and Xiapu Luo\\nTowards constructing a more comprehensive dataset and benchmarking large language models\\n(LLMs) in cybersecurity, we propose SecBench, a multi-dimensional benchmarking dataset designed\\nto evaluate LLMs in the cybersecurity domain. Specifically, SecBench assesses LLMs with questions\\nin various formats (multiple-choice questions (MCQs) and short-answer questions (SAQs)), at\\ndifferent capability levels (Knowledge Retention (KR) and Logical Reasoning (LR)), in multiple\\nlanguages (Chinese and English), and across various sub-domains, thereby covering a wide range\\nof knowledge in cybersecurity. 
To construct such an extensive dataset, we began by collecting\\nhigh-quality data from open sources, resulting in an initial dataset of 10,551 MCQs. To further\\nexpand this dataset both qualitatively and quantitatively, we organized a Cybersecurity Question\\nDesign Contest aimed at collecting high-quality questions from the public. After filtering and\\nprocessing the data collected from the contest, we obtained an additional 34,272 MCQs and 3,087\\nSAQs. Additionally, we utilized a powerful LLM, GPT-4, to automatically label the collected data\\naccording to their most relevant capability level and domain. Following the labeling process, we\\nderived SecBench, which consists of 44,823 MCQs and 3,087 SAQs, each well-labeled with detailed\\nmetadata.\\nTo achieve automatic evaluation of SAQs, we employed another powerful yet cost-efficient\\nLLM, GPT-4o-mini, as a grading agent to automatically grade the tested LLMs’ answers based on\\nthe question stem and ground truth (correct answer) provided by SecBench. For evaluation, we\\nconducted benchmarking of 13 State-of-the-Art LLMs on SecBench, demonstrating the usability of\\nSecBench both qualitatively and quantitatively.\\nThe remainder of this paper is structured as follows. §2 provides the necessary background\\ninformation. We introduce the design of SecBench in §3 and detail the dataset construction process\\nin §4. Then we present the benchmarking results in §5, discussion in §6, and conclude with §7.\\nMore information about SecBench can be found at our website [13], and the dataset can be accessed\\nvia the artifact link [12].\\n2 Background\\nBenchmarking LLMs.Evaluating Large Language Models (LLMs) is crucial for understanding\\ntheir capabilities and limitations, as these models have become increasingly influential in various\\napplications, from natural language processing to code generation and beyond. 
The significance\\nof evaluating LLMs lies in ensuring that they perform reliably and effectively in diverse tasks,\\nwhile also identifying areas for improvement. Several popular benchmarks have been developed\\nto assess different aspects of LLM performance. For instance, the MMLU benchmark evaluates\\ngeneral knowledge and reasoning across a wide range of subjects [20]. C-Eval [16] focuses on LLM’s\\ncapability in the specific Chinese context. HumanEval [17] is designed to assess code generation\\nand completion tasks. These benchmarks collectively provide a comprehensive framework for\\nevaluating the multifaceted capabilities of LLMs.\\nBenchmarking LLM in Cybersecurity.The benchmarks discussed earlier primarily focus on\\nassessing the general capabilities of LLMs. However, since LLMs are often fine-tuned for specific\\nexpert domains, it is crucial to evaluate their performance across various specialized fields. In the\\ncontext of cybersecurity, previous studies have attempted to establish datasets to assess LLMs’\\nknowledge in this particular domain [15, 18, 19, 21]. Unfortunately, these studies face two main\\nlimitations. First, the average volume of evaluation data is only at the thousand-level, which may\\nnot be sufficient to provide a comprehensive benchmark. Second, the question design in previous\\nworks predominantly follows the multiple-choice question (MCQ) format, which merely requires\\nthe model to select the correct answer from given options. 
However, a more challenging task, the\\nshort-answer question (SAQ), which requires the model to generate its own answer rather than\\nchoosing from existing ones, has not been explored in previous studies.'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf', 'page_number': 3, 'total_pages': 11, 'entry_id': 'http://arxiv.org/abs/2412.20787v1', 'title': 'SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity', 'authors': ['Pengfei Jing', 'Mengyun Tang', 'Xiaorong Shi', 'Xing Zheng', 'Sen Nie', 'Shi Wu', 'Yong Yang', 'Xiapu Luo'], 'published': '2024-12-30', 'updated': '2024-12-30', 'primary_category': 'cs.CR', 'categories': ['cs.CR', 'cs.AI'], 'pdf_url': 'http://arxiv.org/pdf/2412.20787v1'}, text='SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity 3\\nMulti-Language SecBench\\nDataset\\nChinese\\n English\\nMulti-Level\\nKnowledge\\nRetention\\nLogical\\nReasoning\\nMulti-Form\\nMultiple Choice\\nQuestion - MCQ\\nShort Answer\\nQuestion - SAQ\\nMulti-\\nDomain\\nD1. Security\\nManagement\\nD3. Network and\\nInfrastructure Security\\nD2. Data\\nSecurity\\nD4. Security Standards\\nand Regulations\\nD5. Application\\nSecurity\\nD6. Identity and\\nAccess Control\\nD7. Fundamental Software and\\nHardware Technology\\nD8. Endpoint and\\nHost Security\\nD9. Cloud\\nSecurity\\nFig. 1. SecBench: A multi-level, multi-language, multi-form, and multi-domain benchmarking dataset for\\nLLM in Cybersecurity.\\n3 SecBench Design\\nFig.1 shows the overview of the SecBench design: it is a comprehensive benchmarking dataset\\naiming to benchmark LLM’s capability in cybersecurity from Multi-Level, Multi-Language, Multi-\\nForm, Multi-Domain.\\nMulti-Level. We devise the capability of LLM in cybersecurity into two different levels: Knowl-\\nedge Retention - KR and Logical Reasoning - LR . 
Among the two, knowledge retention examines the\\nLLM’s ability to retain existing knowledge. The content of such questions is relatively straightfor-\\nward and does not involve complex reasoning. On the other hand, logical reasoning assesses the\\nLLM’s ability to infer the correct answer based on the given information. The difficulty of these\\nquestions is relatively higher and better demonstrates the model’s capability to handle complex\\nproblems.\\nMulti-Language. SecBench includes questions of two mainstream languages - Chinese and\\nEnglish, to present a more comprehensive benchmark.\\nMulti-Form. Unlike previous works that constructed only multiple-choice questions (MCQs) [15,\\n18, 19, 21], SecBench also includes short-answer questions (SAQs) to present a more comprehensive\\nevaluation. This is because SAQs tend to be more challenging than MCQs: for MCQs, the LLM\\nonly needs to choose the correct answer(s) from the given options, while for SAQs, the LLM is\\nprompted to construct its own answer based on the given question. As a result, SAQs can evaluate\\nthe capability of the LLM at a higher level, especially considering the inherent limitations of LLMs\\n(e.g., hallucinations and repetition).\\nMulti-Domain. The questions in SecBench consist of 9 different domains, includingD1. Security\\nManagement, D2. Data Security , D3. Network and Infrastructure Security , D4. Security Standards and\\nRegulations , D5. Application Security , D6. Identity and Access Control , D7. Fundamental Software\\nand Hardware and Technology , D8. Endpoint and Host Security , D9. Cloud Security . Particularly,\\nthe above domains were devised from several rounds of brainstorming and revision, which were\\nexpected to cover most (if not all) related sub-domains in cybersecurity. Note that we do not expect\\nthese domains to be orthogonal, and it is possible that one question can be reasonably labeled into\\ndifferent domains. 
In our dataset, one question is assigned only one most-related domain label from\\nD1 to D9.\\nExample. For each line of data, it is either an MCQ or SAQ, provided with the question stem\\nand corresponding answer, labeled with language (Chinese or English), level (Knowledge Retention\\nor Logical Reasoning) and domain (from D1 to D9).\\nFollowing is one MCQ example, labeled in the domain of Security Management and the level of\\nLogical Reasoning . For MCQs, A blank is left in question stem, and there are four choices given in\\nanswers for the tested LLM to select, with label referring to the correct choice(s) among the four.'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf', 'page_number': 4, 'total_pages': 11, 'entry_id': 'http://arxiv.org/abs/2412.20787v1', 'title': 'SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity', 'authors': ['Pengfei Jing', 'Mengyun Tang', 'Xiaorong Shi', 'Xing Zheng', 'Sen Nie', 'Shi Wu', 'Yong Yang', 'Xiapu Luo'], 'published': '2024-12-30', 'updated': '2024-12-30', 'primary_category': 'cs.CR', 'categories': ['cs.CR', 'cs.AI'], 'pdf_url': 'http://arxiv.org/pdf/2412.20787v1'}, text='4 Pengfei Jing, Mengyun Tang, Xiaorong Shi, Xing Zheng, Sen Nie, Shi Wu, Yong Yang, and Xiapu Luo\\nRaw Materials\\ncleaning &\\nextraction\\nMCQ stems &\\nAnswers\\nLabeled\\nQuestions\\nLLM Labelling\\nInitial Dataset: 10,551 MCQs\\nBenchmarking\\nLLMs\\nCybersecurity\\nQuestion Design\\nContest Large-Scale\\nCollected Data\\nData Quality\\nEvaluation\\n...\\nHigh-Quality Data\\nCandidates\\nLLM-based Cleaning &\\nLabelling\\nLarge-Scale Dataset Construction\\nInitial Dataset Construction\\nContest Dataset: 34,272 MCQs; 3,087 SAQs\\nSecBench\\n44,823 MCQs\\n3,087 SAQs\\nFig. 2. 
SecBench: Dataset Construction.\\n{\"question\":\"In an information security risk management activity of a unit, the risk assessment report\\nsuggested that there were high-risk vulnerabilities in the FTP service of Server A. Subsequently, the unit chose\\nthe treatment measure of shutting down the FTP service in risk treatment, so may I ask to which type of\\nrisk treatment this measure belongs to ()\", \" answers\":[\"Risk reduction\", \"Risk avoidance\", \"Risk transfer\", \"Risk\\nacceptance\"], \"label\":\"B\", \"language\":\"English\", \"domain\":\"Security Management\", \"level\":\"Logical Reasoning\"}\\nFollowing is one SAQ example, labeled in the domain ofData Security and the level of Knowledge\\nRetention. For SAQs, there is no choice given for selection, and the tested LLM is expected to\\nconstruct the answer from scratch. in SAQ, answer refers to the correct answer of the question\\nstem, which will be used to evaluate LLM’s output.\\n{\"question\":\"How does email encryption contribute to regulatory compliance and data protection efforts,\\nand what are some common encryption methods used to secure email communications?\", \" answer\":\"Email\\nencryption helps organizations comply with data protection regulations such as GDPR, HIPAA, and CCPA\\nby safeguarding sensitive information transmitted via email and preventing unauthorized access or disclo-\\nsure. Common encryption methods include symmetric encryption, asymmetric encryption, and end-to-end\\nencryption, each offering varying levels of security and usability. \", \" language\":\"English\", \"domain\":\"Data\\nSecurity\", \"level\":\"Knowledge Retention\"}\\n4 Dataset Construction Process\\n4.1 Initial Dataset Construction\\nQuestion Stems and Answers Extraction.We aim to construct a small batch of datasets to\\nvalidate the rationality of the SecBench framework. 
To achieve this goal, we first collect raw\\nmaterials from various sources that can be used to extract high-quality question data, including\\nreal exam questions from various cybersecurity fields, authoritative books, and so on. Starting\\nfrom these raw materials, we perform automated extraction of questions and answer data from\\nthese resources (for example, using regular expressions). After this step, we have collected a total\\nof 10,551 high-quality MCQs, covering different domains.\\nLLM-based Labeling.However, the dataset obtained in the previous step only contains question\\nstems and answers, lacking the corresponding labels. Therefore, we used the powerful large\\nlanguage model - GPT4 [ 4], to further label this part of the data. With our carefully designed\\nprompts, we utilized a powerful large model (GPT-4) to label these data, including tagging the\\ndifficulty level of the questions (whether it is Knowledge Retention or Logical Reasoning) and\\ntagging the specific domain that the questions assess (as mentioned earlier, from D1 to D9). After'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf', 'page_number': 5, 'total_pages': 11, 'entry_id': 'http://arxiv.org/abs/2412.20787v1', 'title': 'SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity', 'authors': ['Pengfei Jing', 'Mengyun Tang', 'Xiaorong Shi', 'Xing Zheng', 'Sen Nie', 'Shi Wu', 'Yong Yang', 'Xiapu Luo'], 'published': '2024-12-30', 'updated': '2024-12-30', 'primary_category': 'cs.CR', 'categories': ['cs.CR', 'cs.AI'], 'pdf_url': 'http://arxiv.org/pdf/2412.20787v1'}, text='SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity 5\\nLogic Reasoning:\\n9.2%\\nKnowledge\\nRetention: 90.8%\\n Chinese: 80.4%\\nEnglish: 19.6%\\nD7. Fundamental\\nSoftware and Hardware\\nTechnology: 2.1%\\nD4. 
Security Standards\\nand Regulations: 14.4%\\nD9. Cloud\\nSecurity: 4.8%\\nD6. Identity and\\nAccess Control: 5.5%\\nD8. Endpoint and\\nHost Security: 14.0%\\nD1. Security\\nManagement: 17.0%\\nD5. Application \\nSecurity: 15.9%\\nD2. Data Security:\\n11.5%\\nD3. Network and\\nInfrastructure\\nSecurity: 14.9%\\n(a). Distribution of evaluation\\nlevels: KR and LR\\n (b). Distribution of domains: D1 to D9.\\n (c). Distribution of language\\nFig. 3. The distribution of evaluation level, domain and language of the 44,823 MCQs.\\nthis step, we successfully labeled all collected questions. These questions became the prototype\\nfor SecBench, and we used this part of the labeled high-quality data to preliminarily validate the\\nrationality of the SecBench design.\\n4.2 Large-Scale Dataset Construction\\nCybersecurity Question Design Contest.To further expand the SecBench dataset, we have\\norganized a large-scale Cybersecurity Question Design Contest [13]. In this contest, we expected\\nparticipants to submit high-quality evaluation data across multiple domains, which we would\\nsubsequently clean and incorporate into the SecBench database. 
Specifically, we categorized the\\ndata submitted in question into three quality levels, with different weight scores assigned to each\\nlevel to encourage participants to submit high-quality data:\\n- Qualified Quality: The question meets the submission format, contains no factual errors,\\nand is not duplicated with other questions submitted by the same or other participants.\\n- Medium Quality: The question has clear logic and expression, a well-defined domain, an\\nunambiguous answer, and provides a clear and reasonable explanation along with a verifiable\\nsource.\\n- High Quality: The question design should have breadth, depth, and challenge, thus providing\\na high degree of differentiation for the capabilities of different models.\\nLarge-Scale Data Cleaning and Labeling.With the huge amount of data collected from the\\ncontest, we first manually assign the quality level (qualified, medium or high, as stated above) to\\neach submission. This process is performed by experienced experts with enough years of work\\nexperience in the cybersecurity domain, ensuring the justification of the quality attribution process.\\nThen, a rule-based filtering of these high-quality questions was performed to rule out possible\\nduplications or incomplete data that were missed by the former human-based quality attribution.\\nFinally, similar to Sec.4.1, we labeled the questions by their level (KR or LR), language and domain\\nwith the help of LLM. After the whole process, we obtained a total of 34,277 MCQs and 3,087 SAQs,\\ngreatly expanding our initial dataset quantitatively (more MCQs) and qualitatively (introducing\\nthe new evaluation form - SAQs).\\n4.3 Dataset Distribution\\nMCQ. Fig.3 shows the data distribution of the 44,823 MCQs in SecBench. According to Fig.3(a), the\\nmajority of the MCQs fall into the KR category (90.8%), which is reasonable because MCQs tend to\\nhave short question stems and focus on testing the knowledge base of the LLM. 
Notably, 9.2% of\\nthe MCQs are of the LR type, requiring the LLM to perform reasoning to obtain the correct answer.\\nAs shown in Fig.3(b), the data distribution across the 9 domains is generally even, with D6, D7, and'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf', 'page_number': 6, 'total_pages': 11, 'entry_id': 'http://arxiv.org/abs/2412.20787v1', 'title': 'SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity', 'authors': ['Pengfei Jing', 'Mengyun Tang', 'Xiaorong Shi', 'Xing Zheng', 'Sen Nie', 'Shi Wu', 'Yong Yang', 'Xiapu Luo'], 'published': '2024-12-30', 'updated': '2024-12-30', 'primary_category': 'cs.CR', 'categories': ['cs.CR', 'cs.AI'], 'pdf_url': 'http://arxiv.org/pdf/2412.20787v1'}, text='6 Pengfei Jing, Mengyun Tang, Xiaorong Shi, Xing Zheng, Sen Nie, Shi Wu, Yong Yang, and Xiapu Luo\\nLogic Reasoning:\\n63.4%\\nKnowledge\\nRetention: 36.6%\\nChinese: 97.4%\\nEnglish: 2.6%\\nD7. Fundamental Software\\nand Hardware Technology: 1.0%\\nD4. Security Standards\\nand Regulations: 4.7%\\nD9. Cloud\\nSecurity: 3.6%\\nD6. Identity and\\nAccess Control: 4.4%\\nD8. Endpoint and\\nHost Security: 5.6%\\nD1. Security\\nManagement: 19.0%\\nD5. Application \\nSecurity: 29.5%\\nD2. Data Security:\\n12.7%\\nD3. Network and\\nInfrastructure\\nSecurity: 19.5%\\n(a). Distribution of evaluation\\nlevels: KR and LR\\n (b). Distribution of domains: D1 to D9.\\n (c). Distribution of language\\nFig. 4. The distribution of evaluation level, domain and language of the 3,087 SAQs.\\nD9 having relatively less data (5.5%, 2.1%, and 4.8%, respectively). It is important to note that, given\\nthe large size of the dataset (44,823 MCQs), even a 4.8% share corresponds to over 2,000 MCQs.\\nAccording to Fig.3(c), the majority (80.4%) of the MCQs are in Chinese, reflecting the context of\\nthe cybersecurity question design contest. 
Additionally, the 19.6% of English MCQs (nearly 9,000\\nquestions) provide a sufficient dataset for evaluating the LLM’s cybersecurity capabilities in an\\nEnglish context.\\nSAQ. Fig.4 shows the distribution of the evaluation level, domain, and language of the 3,087\\nSAQs. As indicated in Fig.4(a), 36.6% of the SAQs are designed to assess knowledge retention, while\\n63.4% are aimed at evaluating logic reasoning, indicating that the majority of SAQs are challenging\\nand require the LLM to perform reasoning. According to Fig.4(b), the domains D1, D2, D3, and D5\\nconstitute the majority of the assessed domains. Notably, even D7, which comprises only 1.0% of\\nthe SAQs, includes 32 high-quality questions, given the overall dataset size of 3,087. As shown in\\nFig.4(c), 97.4% of the SAQs are in Chinese, reflecting the context in which the contest was held,\\nresulting in questions primarily constructed in Chinese.\\n4.4 Benchmarking Process\\nMCQ. The evaluation of MCQ is rather intuitive: for each MCQ, we check whether the model’s\\noutput (i.e., model’s choice(s) among A, B, C, and D) is the same as the correct answer. For MCQs\\nthat involve more than one correct answer, model’s output is judged as correct only when it is\\nidentical to the correct answer, meaning that no points are awarded for incorrect or incomplete\\nselections. Particularly, the evaluation of MCQ is implemented on the widely-used open-sourced\\nevaluation framework - OpenCompass [10].\\nSAQ.Grading an SAQ is not as intuitive as grading MCQ, in which case we only need to determine\\nwhether the LLM’s choice is the correct one(s). Particularly, grading SAQs requires to understand\\nboth the question stem and the model prediction (answer), and then fairly grade this prediction\\nbased on the ground truth, which is expected to huge amount of manual effort. 
In our work, we\\nintroduce a Grading Agent to realize the automatic grading of SAQs, and Fig.5 shows the process\\nof how the SAQs were evaluation on tested LLMs. Specifically, each SAQ includes the question\\nstem and the ground truth (i.e., correct answer) of the question. The question stem is first fed\\ninto the tested LLMs to generate the Model Prediction , which is the LLMs’ answer waiting to be\\ngraded. Then, the three parts of data, including the question stem, ground truth, and the model\\nprediction will be given to the Grading Agent, which is a sufficiently powerful LLM to grade the\\nModel Prediction based on the ground truth, and output the corresponding scores. Specifically, this\\nGrading Agent should 1). be capable of fairly grading the model prediction, and 2). generate stable\\noutput (e.g., a final grading score for further processing) In our work, we choose the GPT-4o mini'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf', 'page_number': 7, 'total_pages': 11, 'entry_id': 'http://arxiv.org/abs/2412.20787v1', 'title': 'SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity', 'authors': ['Pengfei Jing', 'Mengyun Tang', 'Xiaorong Shi', 'Xing Zheng', 'Sen Nie', 'Shi Wu', 'Yong Yang', 'Xiapu Luo'], 'published': '2024-12-30', 'updated': '2024-12-30', 'primary_category': 'cs.CR', 'categories': ['cs.CR', 'cs.AI'], 'pdf_url': 'http://arxiv.org/pdf/2412.20787v1'}, text='SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity 7\\nSecBench\\nSAQs\\nQuestion\\nStem\\nGround\\nTruth\\n...\\nModel\\nPrediction\\nPrompt:\\nGrade Model Prediction, based\\non the Ground Truth ...\\nGrading Agent\\n...\\n7/10\\n8/10\\n6/10\\n...\\nSAQ Scores\\nTested LLMs\\nFig. 5. 
SAQ evaluation process:A sufficiently powerful LLM is used as the agent to grade the model\\nprediction.\\n[5] as the grading agent, which achieves the balance between the performance (sufficient for the\\nabove two goals) and cost.\\n5 Evaluation\\nBased on SecBench, we conducted extensive benchmarking on 13 SOTA LLMs, including the GPT\\nseries and competitive open-source ones.\\n5.1 MCQ Benchmarking\\nTable 1 presents the benchmarking results for the 44,823 MCQs. The values in each cell represent the\\ncorrectness percentage for the corresponding category. Overall, the correctness of KR is significantly\\nhigher than that of LR, demonstrating the rationale behind our design (i.e., Logical Reasoning\\nis more challenging than Knowledge Retention). The performance of smaller LLMs (with fewer\\nthan 10 billion parameters) is predictably lower than that of larger LLMs (with more than 30\\nbillion parameters) Notably, the Tencent Hunyuan-Turbo model [7] outperforms all existing models,\\nincluding the state-of-the-art GPT-4o and GPT-4o-mini, achieving the highest correctness of 94.28%.\\nIts correctness in Logical Reasoning (93.06%) is also significantly higher than that of other models,\\ndemonstrating Hunyuan-Turbo’s strong capability in solving complex questions in cybersecurity.\\n5.2 SAQ Benchmarking\\nTable 2 presents the benchmarking results for the 3,087 short-answer questions (SAQs). The values\\nin each cell represent the average score, graded by the grading agent, on a percentage scale for\\nthe corresponding category. Compared to MCQs, a larger gap is observed between different LLMs,\\nindicating that solving SAQs is more challenging than MCQs. This is because the tested LLMs are\\nrequired to generate their own answers rather than simply choosing from given options. Notably,\\nfor SAQs, the Tencent Hunyuan-Turbo model [7] also outperforms most existing models, achieving\\na score of 82.13. 
It ranks second only to the state-of-the-art GPT-4o (85.17) and is competitive with\\nthe GPT-4o-mini (82.49).\\n6 Discussion\\nRationale for Using LLMs in the Process.We utilized GPT-4o [6] for labeling data during the\\nconstruction phase of SecBench, and GPT-4o-mini [5] for grading SAQs in the benchmarking phase.\\nTo ensure that these two LLMs are capable of performing the tasks, we explicitly checked their'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf', 'page_number': 8, 'total_pages': 11, 'entry_id': 'http://arxiv.org/abs/2412.20787v1', 'title': 'SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity', 'authors': ['Pengfei Jing', 'Mengyun Tang', 'Xiaorong Shi', 'Xing Zheng', 'Sen Nie', 'Shi Wu', 'Yong Yang', 'Xiapu Luo'], 'published': '2024-12-30', 'updated': '2024-12-30', 'primary_category': 'cs.CR', 'categories': ['cs.CR', 'cs.AI'], 'pdf_url': 'http://arxiv.org/pdf/2412.20787v1'}, text='8 Pengfei Jing, Mengyun Tang, Xiaorong Shi, Xing Zheng, Sen Nie, Shi Wu, Yong Yang, and Xiapu Luo\\nTable 1. MCQ Benchmarking:The average correctness (values are percentage numbers) for all 44,823\\nMCQs. Average: Average correctness of all MCQs.Level: KR - Knowledge Retention; LR - Logical Reasoning.\\nLanguage: CH - Chinese; EN - English. 
Domain: Sub-domains from D1 to D9 in Fig.1.\\nLevel Language Domain\\nAverageKR LR CH EN D1 D2 D3 D4 D5 D6 D7 D8 D9\\nGPT-4o[6] 90.99 91.82 82.75 92.87 83.23 90.32 90.32 88.71 90.51 94.13 89.61 84.43 93.54 90.00\\nGPT-4o-mini[5] 88.79 89.86 78.27 91.37 78.17 88.07 88.30 84.71 88.86 92.84 86.91 75.78 92.90 87.33\\nGPT-3.5-Turbo[3]86.36 87.26 77.43 89.25 74.44 84.71 84.64 82.40 88.34 91.04 81.47 73.84 91.08 84.39\\nHunyuan-Turbo[7]94.28 94.41 93.06 95.58 88.95 94.28 93.85 93.81 93.38 95.77 94.44 93.73 95.51 91.06\\nGLM-4-9B[1] 84.57 85.14 78.95 88.26 69.38 83.26 81.41 80.18 87.11 89.01 80.43 71.35 89.82 83.23\\nLlama-3-8B[8] 77.71 78.43 70.58 80.70 65.43 77.26 74.34 73.07 77.48 82.85 74.51 62.92 84.14 76.74\\nDeepSeek-V2-Lite[2]79.07 80.07 69.22 83.40 61.26 78.21 74.86 73.73 82.48 84.56 71.85 65.51 85.23 76.60\\nQwen2-7B[11] 87.74 88.29 82.29 90.77 75.29 86.94 85.79 85.20 89.38 90.69 83.41 82.49 91.86 83.74\\nYi-1.5-9B[14] 86.44 87.03 80.57 89.04 75.74 85.58 85.61 83.90 87.19 89.74 83.93 80.76 89.63 82.22\\nLlama-3-70B[8] 88.86 89.46 82.97 90.95 80.28 87.27 87.95 85.95 88.54 92.81 87.96 81.95 92.44 87.24\\nQwen2-72B[11] 92.41 92.71 89.50 94.50 83.83 91.90 91.55 91.04 93.01 94.56 90.78 90.05 94.26 89.13\\nYi-1.5-34B[14] 89.59 90.04 85.19 91.48 81.82 89.14 88.71 88.47 90.00 92.20 88.44 87.14 91.38 84.15\\nMixtral-8x7B[9] 86.08 86.78 79.19 88.58 75.76 85.05 84.70 81.52 87.30 91.04 83.13 75.35 89.75 84.39\\nTable 2. SAQ Benchmarking:The average scores graded by the grading agent (converted to a percentage\\nscale). Average: Average correctness of all 3087 SAQs. Level: KR - Knowledge Retention; LR - Logical\\nReasoning. Language: CH - Chinese; EN - English. 
Domain: Sub-domains from D1 to D9 in Fig.1.\\nLevel Language Domain\\nAverageKR LR CH EN D1 D2 D3 D4 D5 D6 D7 D8 D9\\nGPT-4o[6] 85.17 84.37 85.63 84.95 93.25 85.18 86.91 83.19 84.04 85.14 85.55 83.44 85.78 90.36\\nGPT-4o-mini[5] 82.49 81.17 83.25 82.26 91.12 82.55 84.18 79.52 81.44 82.70 84.31 81.56 84.34 87.12\\nGPT-3.5-Turbo[3]74.78 75.54 74.34 74.49 85.50 74.32 76.15 72.83 72.95 75.09 76.72 77.50 74.22 80.45\\nHunyuan-Turbo[7]82.13 79.64 83.56 81.94 89.00 82.89 84.52 79.77 81.78 81.45 84.38 81.25 83.35 83.96\\nGLM-4-9B[1] 66.26 65.06 66.95 65.91 79.38 67.47 67.32 62.35 67.26 66.21 67.66 66.25 67.17 73.15\\nLlama-3-8B[8] 62.39 60.48 63.50 62.11 73.12 62.77 66.40 58.57 64.11 61.94 61.32 61.56 62.89 69.19\\nDeepSeek-V2-Lite[2]44.84 47.09 43.55 44.55 55.75 43.78 44.44 45.80 34.73 44.77 47.15 49.38 49.02 50.00\\nQwen2-7B[11] 59.99 53.39 63.79 60.14 54.25 63.67 64.71 53.67 62.61 57.75 60.80 55.94 63.78 67.57\\nYi-1.5-9B[14] 65.24 64.98 65.39 65.01 73.88 65.86 67.04 62.61 63.77 64.47 66.57 66.88 66.36 74.23\\nLlama-3-70B[8] 68.12 65.93 69.47 69.48 20.62 71.20 67.35 65.81 70.87 67.64 72.67 71.61 67.31 63.14\\nQwen2-72B[11] 82.13 75.93 85.70 81.96 88.25 84.82 85.45 76.89 86.85 80.52 83.21 80.00 85.14 86.13\\nYi-1.5-34B[14] 75.03 67.82 79.18 74.75 85.38 79.25 78.27 68.05 76.71 73.71 77.37 62.50 79.25 81.80\\nMixtral-8x7B[9] 74.78 72.42 76.15 74.38 89.88 75.06 78.44 71.01 73.42 74.37 77.23 74.38 76.65 80.18\\noutput. Specifically, we randomly sampled a mini-batch from the output for manual verification\\nand validated that (1) GPT-4o can successfully label the questions into the corresponding capability\\nlevel and sub-domain, and (2) GPT-4o-mini can fairly grade the LLMs’ answers based on the ground\\ntruth.\\nLanguage Bias.SecBench exhibits a language bias towards Chinese (80.4% in MCQs and 97.4%\\nin SAQs) because the majority of the data in SecBench comes from the Cybersecurity Question\\nDesign Contest, which is held in a Chinese context. 
To ensure the originality and best maintain\\nthe original meaning of the questions, we currently do NOT translate the Chinese questions into\\nEnglish or vice versa. As a result, the scale of SecBench could be further doubled via a translation\\nprocess (e.g., via another powerful LLM), which we leave as future work. Additionally, note that\\nconsidering the large scale of SecBench, it still offers a sufficient amount of English questions\\n(nearly 9,000 MCQs and 100 SAQs) for benchmarking.'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf', 'page_number': 9, 'total_pages': 11, 'entry_id': 'http://arxiv.org/abs/2412.20787v1', 'title': 'SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity', 'authors': ['Pengfei Jing', 'Mengyun Tang', 'Xiaorong Shi', 'Xing Zheng', 'Sen Nie', 'Shi Wu', 'Yong Yang', 'Xiapu Luo'], 'published': '2024-12-30', 'updated': '2024-12-30', 'primary_category': 'cs.CR', 'categories': ['cs.CR', 'cs.AI'], 'pdf_url': 'http://arxiv.org/pdf/2412.20787v1'}, text='SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity 9\\n7 Conclusion\\nWe propose SecBench, a multi-dimensional benchmarking dataset specifically designed to evaluate\\nLLMs in the cybersecurity domain. SecBench addresses the limitations of existing benchmarks by\\nincluding questions in various formats (MCQs and SAQs), at different capability levels (Knowledge\\nRetention and Logical Reasoning), in multiple languages (Chinese and English), and across various\\nsub-domains. The dataset was meticulously constructed by collecting high-quality data from open\\nsources and organizing a Cybersecurity Question Design Contest, resulting in 44,823 MCQs and\\n3,087 SAQs. 
To ensure the quality and consistency of the dataset, we employed GPT-4 for data\\nlabeling and GPT-4o-mini as a grading agent for the automatic evaluation of SAQs. Benchmarking\\nresults demonstrate the usability and comprehensiveness of SecBench, making it arguably the\\nlargest and most detailed benchmark dataset for LLMs in the field of cybersecurity. More information\\nabout SecBench can be found at our website [13], and the dataset can be accessed via the artifact\\nlink [12].\\nReferences\\n[1] 2024. ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools. https://arxiv .org/pdf/\\n2406.12793.\\n[2] 2024. DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model. https://arxiv .org/abs/\\n2405.04434.\\n[3] 2024. GPT-3.5-Turbo. https://platform .openai.com/docs/models/gpt-3-5-turbo.\\n[4] 2024. GPT-4 API. https://platform .openai.com/docs/models/gp#gpt-4-turbo-and-gpt-4.\\n[5] 2024. GPT-4o mini: advancing cost-efficient intelligence. https://openai .com/index/gpt-4o-mini-advancing-cost-\\nefficient-intelligence/.\\n[6] 2024. GPT-4o: OpenAI’s new flagship model that can reason across audio, vision, and text in real time. https:\\n//openai.com/index/hello-gpt-4o/.\\n[7] 2024. Hunyuan-Large: An Open-Source MoE Model with 52 Billion Activated Parameters by Tencent. https://\\nllm.hunyuan.tencent.com/.\\n[8] 2024. Introducing Meta Llama 3: The most capable openly available LLM to date. https://ai .meta.com/blog/meta-\\nllama-3/.\\n[9] 2024. Mixtral of Experts. https://arxiv .org/pdf/2401.04088.\\n[10] 2024. OpenCompass: An Open-sourced Platform for Evaluation LLMs. https://github .com/open-compass/\\nopencompass.\\n[11] 2024. Qwen2 Technical Report. https://arxiv .org/abs/2407.10671.\\n[12] 2024. SecBench: Artifact. https://zenodo .org/records/14575303.\\n[13] 2024. SecBench: Comprehensively Benchmarking LLMs in Cybersecurity. https://secbench .org/.\\n[14] 2024. Yi: Open Foundation Models by 01.AI. 
https://github .com/01-ai/Yi-1 .5.\\n[15] Dipkamal Bhusal, Md Tanvirul Alam, Le Nguyen, Ashim Mahara, Zachary Lightcap, Rodney Frazier, Romy Fieblinger,\\nGrace Long Torales, and Nidhi Rastogi. 2024. SECURE: Benchmarking Generative Large Language Models for\\nCybersecurity Advisory. arXiv preprint arXiv:2405.20441 (2024).\\n[16] C-Eval. 2024. C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models. https:\\n//cevalbenchmark.com/\\n[17] HumanEval. 2024. HumanEval: Hand-Written Evaluation Set. https://github .com/openai/human-eval\\n[18] Zefang Liu. 2023. Secqa: A concise question-answering dataset for evaluating large language models in computer\\nsecurity. arXiv preprint arXiv:2312.15838 (2023).\\n[19] Zefang Liu, Jialei Shi, and John F Buford. 2024. Cyberbench: A multi-task benchmark for evaluating large language\\nmodels in cybersecurity.\\n[20] MMLU. 2024. Measuring Massive Multitask Language Understanding (MMLU). https://github .com/hendrycks/test\\n[21] Norbert Tihanyi, Mohamed Amine Ferrag, Ridhi Jain, Tamas Bisztray, and Merouane Debbah. 2024. CyberMetric: A\\nBenchmark Dataset based on Retrieval-Augmented Generation for Evaluating LLMs in Cybersecurity Knowledge. In\\n2024 IEEE International Conference on Cyber Security and Resilience (CSR) . 
IEEE, 296–302.'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf', 'page_number': 10, 'total_pages': 11, 'entry_id': 'http://arxiv.org/abs/2412.20787v1', 'title': 'SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity', 'authors': ['Pengfei Jing', 'Mengyun Tang', 'Xiaorong Shi', 'Xing Zheng', 'Sen Nie', 'Shi Wu', 'Yong Yang', 'Xiapu Luo'], 'published': '2024-12-30', 'updated': '2024-12-30', 'primary_category': 'cs.CR', 'categories': ['cs.CR', 'cs.AI'], 'pdf_url': 'http://arxiv.org/pdf/2412.20787v1'}, text='10 Pengfei Jing, Mengyun Tang, Xiaorong Shi, Xing Zheng, Sen Nie, Shi Wu, Yong Yang, and Xiapu Luo\\nA Detailed Prompts\\nLLM-based labeling.Following is the prompt text that is implemented on GPT-4 for labeling\\nSecBench data. In the prompt, we explicitly detailed the requirement and offered few-shot example\\nto ensure the performance.\\n# Task Description\\nI will upload a question related to information security, and now I need you to help me annotate these questions. I\\nneed you to annotate these questions from two dimensions: 1. Assessing ability: whether the question assesses basic\\nKnowledge Retention or more challenging Logical Reasoning ability. 2. Assessing domain: which specific subfield the\\nquestion belongs to. Next, I will elaborate on these two requirements:\\n1.Assessing ability. First, you will classify each question into one of the following two categories: (a). Knowledge\\nRetention question: This type of question tests whether the test taker has the relevant background knowledge through\\nstraightforward descriptions. The answers to these questions can be obtained directly by querying the knowledge base\\nand do not involve reasoning processes. (b). 
Logical Reasoning question: This type of question presents the test taker\\nwith a specific scenario and requires the test taker to reason or calculate within that scenario to arrive at the correct\\nanswer. Compared to Knowledge Retention questions, these questions are more challenging.\\n2.Assessing domain. Next, you will annotate the specific domain that each question assesses into one of the following\\n10 categories: (1). Identity and Access Control, (2). Cloud Security, (3). Endpoint and Host Security, (4). Security\\nStandards and Regulations, (5). Data Security, (6). Security Management, (7). Network and Infrastructure Security, (8).\\nFundamental Software and Hardware Technology, (9). Application Security, (10). Others. Note that if you believe a\\nquestion cannot be classified into any of the categories (1) (9), then classify it as (10). Others.\\nFinally, you will provide the reason and basis for your classification of the question.\\n—\\n# Input Introduction\\nThe questions I upload consist of the following format:\\n{\"question\":\"Which of the following is directly related to database security?\", \"answers\":[\"Granularity of access con-\\ntrol\", \"Size of the database\", \"Number of attributes in the relation table\", \"Number of tuples in the relation table\"], \"label\":\"A\"}\\nEach line of the file includes 3 elements: question is the main body of the question, answers are the four provided\\noptions, and label is the correct answer.\\n—\\n# Output Requirements\\nFor each question I upload, you will annotate it according to my requirements and add the annotation results directly\\nto the original data. The annotation results should be in Chinese. 
You will insert the annotation results after each piece\\nof data, including:\\n- \"assessed ability\": whether it is Knowledge Retention or Logical Reasoning.\\n- \"assessed domain\": which domain the knowledge being assessed belongs to.\\n- \"reason for labeling\": the reason for your annotation, including the reason for the ability annotation and the reason\\nfor the domain annotation, all need to be explained. This explanation must be detailed and explain why you believe the\\nquestion assesses knowledge from a specific domain, not just give a meaningless reason.\\nUsing the example question from the \"Data Introduction\" section, the annotated result should be as follows:\\n{ \"question\": \"Which of the following is directly related to database security?\", \"answers\": [\"Granularity of access\\ncontrol\", \"Size of the database\", \"Number of attributes in the relation table\", \"Number of tuples in the relation table\"],\\n\"label\": \"A\", \"assessed ability\": \"Knowledge Retention\", \"assessed domain\": \"Data Security\", \"reason for labeling\": \"This\\nquestion directly tests specific basic knowledge related to database security and does not involve logical reasoning,\\nso it is labeled as knowledge memory; it can be directly seen from the question stem that this question tests specific\\nknowledge of database security and should be classified under ’Data Security’. 
\" }\\n—'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.20787v1.SecBench__A_Comprehensive_Multi_Dimensional_Benchmarking_Dataset_for_LLMs_in_Cybersecurity.pdf', 'page_number': 11, 'total_pages': 11, 'entry_id': 'http://arxiv.org/abs/2412.20787v1', 'title': 'SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity', 'authors': ['Pengfei Jing', 'Mengyun Tang', 'Xiaorong Shi', 'Xing Zheng', 'Sen Nie', 'Shi Wu', 'Yong Yang', 'Xiapu Luo'], 'published': '2024-12-30', 'updated': '2024-12-30', 'primary_category': 'cs.CR', 'categories': ['cs.CR', 'cs.AI'], 'pdf_url': 'http://arxiv.org/pdf/2412.20787v1'}, text='SecBench: A Comprehensive Multi-Dimensional Benchmarking Dataset for LLMs in Cybersecurity 11\\n# Judgment Criteria\\nFor the first annotation task (i.e., 1. Assessing ability), you will follow the following criteria:\\n(1). Questions involving numerical calculations (e.g., encryption and decryption algorithms) must be Logical Reasoning\\nquestions.\\n(2). Questions involving specific code or Linux commands must be Logical Reasoning questions.\\n(3). If the question stem provides a hypothetical subject (e.g., specific names, a company, an organization, a security\\nanalyst) and describes this hypothetical subject in detail, and the question is designed based on this, then the question\\nmust be a Logical Reasoning question.\\n(4). Questions with relatively long stems are more likely to be Logical Reasoning questions (because the stem contains\\nmore information, often requiring the test taker to understand and reason). Conversely, questions with shorter stems\\nare more likely to be Knowledge Retention questions. 
Note: This criterion is not absolute and can only serve as an\\nauxiliary judgment criterion.\\n—\\n# Few-shot Examples\\n{ \"question\": \"Interface testing could involve which of the following?\", \"answers\": [\"The application programming\\ninterface (API)\", \"The graphical user interface (GUI)\", \"Both of the above\", \"None of the above\"], \"label\": \"C\", \"assessed\\nability\": \"Knowledge Retention\", \"assessed domain\": \"Application Security\", \"reason for labeling\": \"This question directly\\ntests whether the candidate understands the specific process of interface testing and does not involve complex logical\\nreasoning, so it is labeled as knowledge memory; interface testing is often related to specific applications, so it is\\nclassified under ’Application Security’. \" }\\n{ \"question\": \"Two-key triple DES encryption: C=CK1[DK2[EK1[P]]], K1≠K2, where the effective key is ()\", \"answers\":\\n[\"56\", \"128\", \"168\", \"112\"], \"label\": \"D\", \"assessed ability\": \"Logical Reasoning\", \"assessed domain\": \"Data Security\", \"reason\\nfor labeling\": \"This question requires the candidate to understand DES encryption and calculate the correct answer\\nbased on the question stem, involving a logical reasoning process, so it is labeled as logical reasoning; the question\\ntests encryption algorithms and is directly related to data security, so it is classified under ’Data Security’. \" }\\n...\\n—\\nNow please annotate the following question:\\n{Input Question}\\nSAQ Grading.Following is the prompt that the grading agent (implemented on GPT-4o-mini in\\nour work) used to grade the LLM’s output for benchmarking SAQs.\\nPlease help me grade a student’s answers in the network information security exam. I will provide you with a dataset\\nthat contains three parts: 1. The question stem and specific question, 2. The standard answer (i.e., the full score answer),\\n3. The student’s answer (to be graded). 
Specifically, you will perform the following steps:\\n1. For each question, read the question stem and understand the content of the question.\\n2. For each question, read the student’s answer and compare it with the standard answer.\\n3. For each question, based on the differences between the student’s answer and the standard answer, grade the\\nstudent’s answer. The question is scored on a 10-point scale, with a full score of 10 points.\\nRecord and return the student’s score for each question in the form of a JSON file.\\n—\\nYour output should only contain a JSON file in the following format, where each data entry only includes the student’s\\nscore, with the key being \"score\" and the value being an integer between 0 and 10, inclusive, for example:\\n[{\"model_score\": 6}]\\n—\\nBelow are the questions you need to process, consisting of three parts: 1. The question stem and specific question, 2.\\nThe standard answer (i.e., the full score answer), 3. The student’s answer (to be graded). Please grade based on the data\\nbelow and return the JSON file in the format mentioned above:\\n1. The question stem: {Question Stem from SecBench}\\n2. The standard answer (i.e., the full score answer): {Ground Truth from SecBench}\\n3. 
The student’s answer (to be graded): {LLM’s output to be graded}'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.13420v1.BotSim__LLM_Powered_Malicious_Social_Botnet_Simulation.pdf', 'page_number': 1, 'total_pages': 21, 'entry_id': 'http://arxiv.org/abs/2412.13420v1', 'title': 'BotSim: LLM-Powered Malicious Social Botnet Simulation', 'authors': ['Boyu Qiao', 'Kun Li', 'Wei Zhou', 'Shilong Li', 'Qianqian Lu', 'Songlin Hu'], 'published': '2024-12-18', 'updated': '2024-12-18', 'primary_category': 'cs.SI', 'categories': ['cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2412.13420v1'}, text='BotSim: LLM-Powered Malicious Social Botnet Simulation\\nBoyu Qiao1,2, Kun Li1*, Wei Zhou1, Shilong Li1,2, Qianqian Lu1, Songlin Hu1,2\\n1Institute of Information Engineering, Chinese Academy of Sciences\\n2School of Cyber Security, University of Chinese Academy of Sciences\\n{qiaoboyu, likun2, zhouwei, lishilong, luqianqian, husonglin}@iie.ac.cn\\nAbstract\\nSocial media platforms like X(Twitter) and Reddit are vital\\nto global communication. However, advancements in Large\\nLanguage Model (LLM) technology give rise to social media\\nbots with unprecedented intelligence. These bots adeptly sim-\\nulate human profiles, conversations, and interactions, dissem-\\ninating large amounts of false information and posing signif-\\nicant challenges to platform regulation. To better understand\\nand counter these threats, we innovatively design BotSim, a\\nmalicious social botnet simulation powered by LLM. Bot-\\nSim mimics the information dissemination patterns of real-\\nworld social networks, creating a virtual environment com-\\nposed of intelligent agent bots and real human users. In the\\ntemporal simulation constructed by BotSim, these advanced\\nagent bots autonomously engage in social interactions such as\\nposting and commenting, effectively modeling scenarios of\\ninformation flow and user interaction. 
Building on the Bot-\\nSim framework, we construct a highly human-like, LLM-\\ndriven bot dataset called BotSim-24 and benchmark multi-\\nple bot detection strategies against it. The experimental re-\\nsults indicate that detection methods effective on traditional\\nbot datasets perform worse on BotSim-24, highlighting the\\nurgent need for new detection strategies to address the cyber-\\nsecurity threats posed by these advanced bots.\\nCode — https://github.com/QQQQQQBY/BotSim\\nIntroduction\\nIn the modern digital era, online social networks (OSNs)\\nsuch as X (formerly Twitter), and Reddit have become es-\\nsential mediums for shaping human interaction due to their\\nextensive connectivity and real-time information exchange.\\nHowever, the prevalence of bots on these platforms poses\\na significant threat to OSN security (Cresci 2020; Ferrara\\n2023). For example, social bots have played notable roles\\nin major events like presidential elections (Guglielmi 2020;\\nPacheco 2024) and global pandemics (Gallotti et al. 2020;\\nHimelein-Wachowiak et al. 2021), where they disseminate\\nmisinformation and sway public opinion. Previous instances\\nof social bots primarily stem from rule-based programs,\\nhowever, recent advancements have integrated large lan-\\nguage models (LLMs) that endow bots with more sophis-\\n*Corresponding Author.\\nCopyright © 2025, Association for the Advancement of Artificial\\nIntelligence (www.aaai.org). All rights reserved.\\nticated, human-like capabilities (Yang and Menczer 2024).\\nThis development has further intensified the problem of in-\\nformation pollution on OSNs (Sun et al. 2024). Therefore,\\nupgrading current detection systems and understanding the\\ncharacteristics of LLM-driven bots has become a critical pri-\\nority.\\nPrevious research methods have predominantly been de-\\nveloped using traditional bot datasets. For instance, Yang\\net al. 
(2020) proposed a method that exploits differences\\nin user profiles, while Cresci et al. (2016) suggested iden-\\ntifying the longest common subsequence of user actions.\\nWith advancements in deep learning, new methods have\\nemerged focusing on text semantic content and user inter-\\naction networks. Wei et al. (2019) introduced the use of re-\\ncurrent neural networks (RNNs) to encode posts and detect\\nbots based on their semantic content. More recent meth-\\nods, such as RGT (Feng et al. 2022), and BECE (Qiao\\net al. 2024) have employed graph neural networks (GNNs)\\nand graph-enhanced strategies to improve detection perfor-\\nmance. However, LLM-powered bots exhibit greater logical\\ncoherence and human-like qualities in profiles, text content,\\nand interaction strategies, posing significant challenges to\\nthese existing detection methods (Feng et al. 2024; Ferrara\\n2023). Therefore, collecting datasets of LLM-driven bots\\nis essential for developing new detection techniques (Yang\\nand Menczer 2024). Traditional dataset collection methods,\\nhowever, encounter the following two major challenges:\\n(1) Intelligent Challenges and Decline in Labeling Qual-\\nity: The intelligence of LLM-driven bots has significantly\\nadvanced, making manual annotation tasks much more chal-\\nlenging and leading to a notable decline in annotation qual-\\nity (Zhang et al. 2024). For instance, crowdsourcing tests\\nconducted by Cresci et al. (2017) revealed that manual an-\\nnotators had an accuracy rate of less than 24% when labeling\\nsocial spam bots. Consequently, manual annotation has be-\\ncome unreliable, impairing the ability of detection models to\\ndifferentiate between bots and genuine users.\\n(2) Ethical Constraints: For ethical reasons, large-scale de-\\nployment of social bots disguised as humans in real social\\nnetworks to obtain genuine annotations for research is sub-\\nject to strict restrictions. 
This situation makes research more\\ncomplex and challenging.\\nTo address these challenges, we design a scalable ma-\\nlicious social botnet simulation framework called BotSim,\\narXiv:2412.13420v1 [cs.SI] 18 Dec 2024'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.13420v1.BotSim__LLM_Powered_Malicious_Social_Botnet_Simulation.pdf', 'page_number': 2, 'total_pages': 21, 'entry_id': 'http://arxiv.org/abs/2412.13420v1', 'title': 'BotSim: LLM-Powered Malicious Social Botnet Simulation', 'authors': ['Boyu Qiao', 'Kun Li', 'Wei Zhou', 'Shilong Li', 'Qianqian Lu', 'Songlin Hu'], 'published': '2024-12-18', 'updated': '2024-12-18', 'primary_category': 'cs.SI', 'categories': ['cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2412.13420v1'}, text='upon which we construct an accurately labeled, LLM-driven\\nbot dataset named BotSim-24. This dataset includes both\\nreal human accounts and LLM-driven agent bot accounts.\\nTo enhance the dataset’s complexity, we implement a series\\nof disguise techniques based on detection methods proposed\\nin previous research focusing on bot profiles (Yang et al.\\n2020), textual content (Qiao et al. 2023), and interaction\\nbehavior patterns (Li et al. 2023). By leveraging LLMs to\\nanalyze and simulate characteristics of real users, we con-\\nstruct a comprehensively disguised and highly human-like\\nLLM-driven bot dataset to expose and challenge the limita-\\ntions and weaknesses of existing detection methods. We then\\nbenchmark multiple bot detection strategies on the BotSim-\\n24 dataset. The experimental results validate the effective-\\nness of the dataset and underscore the significant threat that\\nadvanced bots pose to network security.\\nOur contributions can be summarized as follows:\\n• BotSim Framework: We are the first to propose a\\nscalable LLM-driven malicious social botnet simula-\\ntion framework, BotSim. 
This environment enables re-\\nsearchers to continuously track the latest bot evolution\\nstrategies and generate up-to-date datasets, thereby ad-\\nvancing the development of new detection methods.\\n• LLM-Driven Bot Dataset: Leveraging the BotSim sim-\\nulation framework, we meticulously construct a bot de-\\ntection dataset based on interaction scenarios from Red-\\ndit. This dataset incorporates real Reddit users and LLM-\\ndriven bot accounts, providing a comprehensive range of\\ninteraction data that enhances existing resources for so-\\ncial bot detection research.\\n• Experimental Evaluation: We conduct extensive exper-\\niments on the BotSim-24 dataset to evaluate the perfor-\\nmance of various social bot detection models. The results\\nshow that detection methods effective on traditional bot\\ndatasets perform poorly on BotSim-24, highlighting the\\nurgent need for new detection strategies to address the\\ncybersecurity threats posed by these advanced bots.\\nBotSim: Botnet Simulation Framework\\nThe overall framework of BotSim is shown in Figure 1, and\\nit aims to model the activity characteristics and behavior pat-\\nterns of LLM-driven malicious social bots in OSNs. BotSim\\nconsists of four components: the social environment, envi-\\nronmental perception, action list, and agent decision center.\\nPreliminaries\\nIn this paper, we aim to use a botnet simulation framework\\nto model the activity characteristics and behavior patterns of\\nLLM-driven malicious bots on OSNs. The BotSim frame-\\nwork includes two types of users: human accounts from real\\nsocial ecosystems, denoted as UH = {Uh1 , Uh2 , ..., Uhn }\\nand LLM-driven agent bot accounts, denoted as UB =\\n{Ub1 , Ub2 , ..., Ubm }, where n and m represent the number\\nof humans and bots, respectively. To simulate the continu-\\nous passage of time and the dynamic changes in interaction\\ntiming in real OSNs, we set up a timeline mechanism T =\\n{t1, t2, ..., tn}. 
In the timeline process, the set of interactions\\nbetween users is represented as D = {UB, UH, E, T} with\\nE = {e1, e2, ..., en} denoting the set of interaction relation-\\nships among users.\\nSocial Environment\\nThe social environment of BotSim is built from real social\\nmedia ecosystem data and consists of account collection,\\nmessage feeding, timeline setup, and interaction mode.\\nAccount Collection The account collection includes real\\nhuman accounts UH and virtual Agent bot accountsUB. Hu-\\nman accounts are sourced from data collected in real social\\nenvironments, while the configuration and behavior of agent\\nbot accounts are constructed by LLM-driven agents.\\nMessage Feeding Message feeding utilizes a dual-filtering\\nmechanism based on timelines and recommendation func-\\ntions. Initially, the message flow is filtered through the time-\\nline, and then it is optimally ranked by the recommendation\\nfunction to produce the final message stream.\\nTimeline Setup The timeline setup T = {t1, t2, ..., tn}\\nensures the environment operates according to a predefined\\ntimeline logic. Additionally, each agent bot has its dedicated\\ntimeline, which is determined by the bot’s activities and in-\\nteractions with other accounts to meet the need for rapid\\nsimulation of long-time-span interactions.\\nInteraction Mode The interaction patterns E =\\n{e1, e2, ..., en} must adhere to the interaction settings de-\\nfined by the specific social media platform. 
Interactions be-\\ntween accounts are accompanied by message flow outputs,\\nsuch as likes and comments on current messages.\\nEnvironment Perception\\nThe environment perception mechanism is important in the\\noperation of BotSim, which helps the agent to capture the\\ndynamic changes of the social environment and accurately\\ntransfer the perceived multi-dimensional information to the\\nagent decision center so that the agent can make adaptive\\ndecisions based on the environmental information.\\nIn BotSim, account profiles, message stream updates, and\\ncomplex interaction data collectively form the core elements\\nof the social environment. To enhance the agents’ under-\\nstanding and responsiveness to these complex environments,\\nwe have designed clear and structured prompts to assist\\nthe LLM in comprehending environmental information. De-\\ntailed prompts can be found in Appendix B.1.\\nAction List\\nThe action list integrates commonly used information dis-\\nsemination interactions on social media, including the fol-\\nlowing actions: (1) Create User: Create a new user profile.\\n(2) Post: Generate and publish original content based on\\nbackground knowledge and preferences. (3) Comment: Re-\\nply to selected posts or comments. (4) Repost: Share posts\\nto achieve targeted information dissemination. (5) Like:\\nLike posts to enhance positive feedback during interactions.\\n(6) Browse: Continue browsing the message stream based\\non the internal timeline if no preferred content is found. 
(7)\\nEnd: Complete the mission and terminate the action.'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.13420v1.BotSim__LLM_Powered_Malicious_Social_Botnet_Simulation.pdf', 'page_number': 3, 'total_pages': 21, 'entry_id': 'http://arxiv.org/abs/2412.13420v1', 'title': 'BotSim: LLM-Powered Malicious Social Botnet Simulation', 'authors': ['Boyu Qiao', 'Kun Li', 'Wei Zhou', 'Shilong Li', 'Qianqian Lu', 'Songlin Hu'], 'published': '2024-12-18', 'updated': '2024-12-18', 'primary_category': 'cs.SI', 'categories': ['cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2412.13420v1'}, text='Social Media Platform \\nReal Datasets\\nAccounts Messages\\nInteraction\\nSocial Environment\\nEnvironment Perception\\nSocial\\nNetwork\\nPosts\\nComments\\nUser \\nProfiles\\ndoomed\\n@doomed\\nOrlando, FL\\nNovember 2008\\nGreat article for ....\\nAnna\\n9 Minutes ago\\nZoe\\na few seconds ago\\nWhen I first ...\\nCreate Post\\nLike Comment\\nRepost ...\\nAction List\\nRole\\nMemory\\nGoal\\nPerception\\nActions\\nKnowledge Relevant historical \\ncontent\\nMemory\\nHistorical Posts\\nHistorical Comments\\nGoal Tasks\\nGoal\\nAction 1\\nAction n\\nbreak \\ndown\\nAgent\\nAgent Decision CenterPrompt\\n Large Language \\n Model (LLM)\\nDecision Making\\nReasoning\\n/Responsing\\nTimeline\\n...\\nUser Info\\n Relationship\\nFigure 1: The overall framework of BotSim.\\nBotSim provides a list of commonly used actions for in-\\nformation dissemination across various OSNs. Future re-\\nsearch can select the appropriate actions based on specific\\nneeds and add new actions as required. Detailed description\\nof the action list in Appendix B.2.\\nAgent Decision Center\\nThe Agent Decision Center, as the core component of Bot-\\nSim, integrates multidimensional information including goal\\ntasks, role settings, background knowledge, environmental\\nperception, action lists, and memory data. 
Its primary func-\\ntion is to accurately plan and execute action decisions, driv-\\ning the comprehensive operation of BotSim.\\nGoal Tasks Goal tasks G define the specific needs for\\ninformation dissemination and guide the agent’s actions.\\nThe operators set these goals, and then the LLM decom-\\nposes the goal tasks into manageable and planned actions\\nPA = {pa1, pa2, ..., pak} to ensure the goals are achieved.\\nPrompts for goal tasks are detailed in Appendix B.3.\\nRole Setting Role settings are crucial for the agent’s\\ndecision-making process and include multidimensional at-\\ntributes such as age, name, gender, preferences, education\\nlevel, description, and geographic location. These attributes\\nare applied to the profiles of created user accounts to help the\\nagent establish a persona, enhancing both emotional expres-\\nsion and decision-making accuracy. More detailed informa-\\ntion on role setup is provided in Appendix B.2.\\nBackground Knowledge Given that LLMs may struggle\\nto capture new social dynamics and knowledge, providing\\nbackground knowledge KL can help LLMs generate rele-\\nvant and novel content that aligns with goal tasks.\\nMemory Mechanism The memory mechanism filters rele-\\nvant posts and comments related to the current task from the\\nagent’s historical records. This mechanism assists the agent\\nin responding appropriately. An example of memory infor-\\nmation is presented in Appendix B.4.\\nBotSim Execution Process\\nThe overall execution process of the agent bots in BotSim in-\\nvolves the following steps: (1) Specify the Platform: Iden-\\ntify the social media platform to be simulated and gather the\\nrelevant data, including user profiles, messages, timestamps,\\nand interaction data. (2) Define Goal Tasks: Clearly out-\\nline the goal tasks and compile the necessary background\\nknowledge. (3) Break Down Tasks: Decompose the goal\\ntasks into a series of executable actions, as detailed in Ap-\\npendix B.3. 
(4) Formulate Environment Prompts: Per-\\nceive changes in the simulated social environment and cre-\\nate appropriate prompts, as detailed in Appendix B.1. (5)\\nRetrieve Memory Data: Access historical posts and com-\\nments relevant to the goal task. (6) Construct and Execute\\nPrompts: Build prompts using environmental perception in-\\nformation, memory data, planned action sequences, role set-\\ntings, and background knowledge. Use these prompts to in-\\nstruct the LLM, which will return the required action param-\\neters. (7) Update and Monitor: Refresh the social environ-\\nment and track the progress of the action sequence. If not\\ncompleted, return to step (4). If completed, proceed to step\\n(8). (8) End: Conclude the execution.\\nA complete prompt example is provided in Appendix B.4,\\nand the algorithm for this execution process is further ex-\\nplained in Appendix B.5.\\nBotSim-24: LLM-driven Bot Detection Dataset\\nIn this section, we present BotSim-24, a bot detection dataset\\npowered by LLM. Building on the BotSim framework, we\\nsimulate information dissemination and user interactions\\nacross six SubReddits on Reddit. 
This process results in the'),\n", - " Document(metadata={'file_path': 'arxiv_papers/2412.13420v1.BotSim__LLM_Powered_Malicious_Social_Botnet_Simulation.pdf', 'page_number': 4, 'total_pages': 21, 'entry_id': 'http://arxiv.org/abs/2412.13420v1', 'title': 'BotSim: LLM-Powered Malicious Social Botnet Simulation', 'authors': ['Boyu Qiao', 'Kun Li', 'Wei Zhou', 'Shilong Li', 'Qianqian Lu', 'Songlin Hu'], 'published': '2024-12-18', 'updated': '2024-12-18', 'primary_category': 'cs.SI', 'categories': ['cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2412.13420v1'}, text='creation of the BotSim-24 dataset, which includes 1,907 hu-\\nman accounts and 1,000 LLM-driven agent bot accounts.\\nPre-Prepared Data\\nWe first introduce the real OSN data information that must\\nbe pre-prepared for the BotSim simulation.\\nReddit Social Environment Data Collection We choose\\nsix popular news-related SubReddits on Reddit to construct\\nthe social environment data for BotSim: “worldnews”, “pol-\\nitics”, “news”, “InternationalNews”, “UpliftingNews” and\\n“GlobalTalk”. We collect posts, first- and second-level com-\\nments, timestamps, and user profiles from these six SubRed-\\ndits between June 20, 2023, and June 19, 2024. We filter and\\nannotate the collected accounts, resulting in 1,907 human\\nReddit accounts. More detailed data filtering and statistical\\ninformation are presented in Appendix C.1.\\nGoal Tasks Our goal task is to create agent bots designed\\nto spread disinformation within six news-oriented SubRed-\\ndits. We focus on three highly debated international news\\nevents from 2023 to 2024: the “Russia-Ukraine war,” the\\n“Israeli-Palestinian conflict,” and “U.S. 
politics.” Our objec-\\ntive is to disseminate disinformation related to these topics\\nwhile concealing our activities by posting and engaging in\\ndiscussions about a broad spectrum of international news on\\nthe SubReddits.\\nBackground Knowledge Collection To build the knowl-\\nedge base for the three major news events and vari-\\nous international news used for our goal tasks, we col-\\nlect real news from four authoritative international news\\nsources —“BBC”, “NBC News”, “NYTimes”, and “Peo-\\nple’s Daily”, as well as fact-checking sites “Truthorfiction”\\nand “Snopes”. The data spans from June 2023 to June 2024.\\nThis knowledge base helps the LLM generate content that is\\nmost relevant to the goal tasks. More detailed statistics are\\nin Appendix C.2.\\nUser Role Role settings in BotSim are used to construct\\nthe profiles of agent bots. Usernames and descriptions are\\ngenerated by LLM simulation cases, while age, gender, ed-\\nucation level, and geographic location are randomly as-\\nsigned based on weighted statistics from Reddit1. Addition-\\nally, since the goal tasks involve international news, political\\nideology settings are included in the role settings 2. This in-\\nformation is intended to assist the agent Bots in interactions,\\nbut the BotSim-24 dataset only provides profile information\\nrelevant to Reddit.\\nBot Data Construction\\nPrevious detection methods have primarily focused on iden-\\ntifying bot accounts that lack sufficient anthropomorphic\\nfeatures in areas such as profile metadata (Value or Boolean\\ninformation) (Cresci et al. 2016; Moghaddam and Ab-\\nbaspour 2022; Beskow and Carley 2018), textual content\\n(Qiao et al. 2023; Liu et al. 2023), and interaction patterns\\n(Feng et al. 2021b; Peng et al. 2022). 
Our goal is to create\\nhighly human-like bot accounts, driven by LLMs and based\\n1https://explodingtopics.com/blog/reddit-users\\n2https://news.gallup.com/poll/388988/political-ideology-\\nsteady-conservatives-moderates-tie.aspx\\nSubReddit Posts Users 1-Coms 2-Coms\\nworldnews 14,626 1,405 15,740 859\\npolitics 2,4074 1,744 39,704 3,155\\nnews 8,465 1,471 11,685 441\\nInternationalNews 3,906 554 5,477 311\\nUpliftingNews 1,219 266 1,148 35\\nGlobalTalk 342 342 472 16\\nTotal 52,632 2,907 74,226 4,817\\nTable 1: Distribution of users, posts, and comments among\\nsix SubReddits. ‘1-Coms’ means first-level comments, ‘2-\\nComs’ means second-level comments. The total number of\\nusers is not the sum of users participating in different Sub-\\nReddits, but the number of accounts participating in the so-\\ncial environment.\\non the BotSim framework, to challenge these detection al-\\ngorithms. To achieve this, the bots must effectively disguise\\nthemselves in these key areas to evade detection.\\nThe disguise strategies we implement for bot accounts are\\nas follows: (1) Metadata Disguise: We statistically analyze\\nsix types of value-type metadata from real Reddit users, in-\\ncluding the number of posts, the number of first-level com-\\nments, the number of second-level comments, the ratio of\\nposts to comments, posting frequency, and the number of\\nactive SubReddits. We then use LLM to integrate this sta-\\ntistical information to generate human-like metadata for bot\\naccounts, effectively achieving metadata disguise. (2) Tex-\\ntual Content Disguise: The posts and comments of bot ac-\\ncounts are generated by LLMs based on contextual knowl-\\nedge, user role information, browsing content, and other\\nrelevant factors. 
Unlike traditional bots, which often pro-\\nduce posts and comments with inconsistent contextual se-\\nmantics, LLM-driven bots utilize advanced text understand-\\ning and generation capabilities to create contextually co-\\nherent and logically sound content, effectively disguising\\nthe textual output. (3) Interaction Disguise: On BotSim\\nReddit, interactions between accounts include first-level and\\nsecond-level replies. The specific posts or comments that\\nbot accounts reply to are autonomously determined by the\\nLLM based on the goal task and browsed information. This\\nmethod leverages the LLM’s analytical capabilities, distin-\\nguishing it from previous rule-based settings, and thereby\\nachieving interaction disguise. We present a more detailed\\ndata statistical analysis and the process of constructing bot\\ndata in Appendix C.3 and C.4.\\nAfter setting up the data information and construction\\nstrategies required for BotSim, we selected GPT4o-mini as\\nthe LLM for generating the BotSim-24 dataset. The BotSim-\\n24 contains users’ profiles, post and comment information,\\nand relationship information. We present the statistical in-\\nformation of the constructed BotSim-24 dataset in Table 1.\\nDataset Process\\nIn this section, we describe the construction of user features\\nand relationships in the BotSim-24 dataset.')]" + "[Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 1, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='What is a Social Media Bot? 
A Global Comparison of\\nBot and Human Characteristics\\nLynnette Hui Xian Ng1,* and Kathleen M. Carley1\\n1Center for Informed Democracy & Social - cybersecurity (IDeaS), Societal and Software Systems Carnegie Mellon\\nUniversity, Pittsburgh, PA 15213\\n*lynnetteng@cmu.edu\\nABSTRACT\\nChatter on social media about global events comes from 20% bots and 80% humans. The chatter by bots and humans is\\nconsistently different: bots tend to use linguistic cues that can be easily automated (e.g., increased hashtags, and positive\\nterms) while humans use cues that require dialogue understanding (e.g. replying to post threads). Bots use words in categories\\nthat match the identities they choose to present, while humans may send messages that are not obviously related to the\\nidentities they present. Bots and humans differ in their communication structure: sampled bots have a star interaction structure,\\nwhile sampled humans have a hierarchical structure. These conclusions are based on a large-scale analysis of social media\\ntweets across ∼ 200 million users across 7 events.\\nSocial media bots took the world by storm when social-cybersecurity researchers realized that social media users not only\\nconsisted of humans, but also of artificial agents called bots. These bots wreck havoc online by spreading disinformation and\\nmanipulating narratives. However, most research on bots are based on special-purposed definitions, mostly predicated on the\\nevent studied. In this article, we first begin by asking, “What is a bot?\", and we study the underlying principles of how bots are\\ndifferent from humans. We develop a first-principle definition of a social media bot. 
This definition refines existing academic\\nand industry definitions: “A Social Media Bot is An automated account that carries out a series of mechanics on social media\\nplatforms, for content creation, distribution and collection, and/or for relationship formation and dissolutions.\" With this definition\\nas a premise, we systematically compare the characteristics between bots and humans across global events, and reflect on\\nhow the software-programmed bot is an Artificial Intelligent algorithm, and its potential for evolution as technology advances.\\nBased on our results, we provide recommendations for the use of bots and for the regulation of bots. Finally, we discuss three\\nopen challenges and future directions of the study of bots: Detect, to systematically identify these automated and potentially\\nevolving bots; Differentiate, to evaluate the goodness of the bot in terms of their content postings and relationship interactions;\\nDisrupt, to moderate the impact of malicious bots, while not unsettling human conversations.\\nIntroduction\\nThe notion of “bots” on social media is ubiquitous across many scholarship. These studies captured a range of different social\\nphenomena where bots operate: politics, hate speech, toxicity and so forth. Bots were used to boost the follower count of\\npoliticians in the 2011 Arab Springs uprising, generating false impressions of popularity1, 2. In the same uprising, bots flooded\\nnews streams to interrupt efforts of political dissidents1, 2. In the US 2020 elections, bots augmented human users in strategic\\ncommunications, and actively distorted or fabricated narratives to create a polarized society3–5. More recently, bots aggressively\\npushed anti-vaccine narratives and conspiracy theories on social media during the 2021 coronavirus pandemic6, 7. Bots applied\\nsocial pressure to influence humans to favor the anti-vaccine ideology 3, 8. 
When the tension of the online ideologies are\\nsufficiently strong, and the spread sufficiently wide, these ideologies spillover to the offline world, resulting in protests, riots\\nand targeted hate-speech9–12. Social media bots gained further media attention in 2022 when Elon Musk proclaimed that at\\nleast 20% of the Twitter users were bots, which were influencing content quality13. Musk later bought the platform, and took\\nsteps to curtail the bot population in a “global bot purge\", which includes removing huge amounts of bots, and charging newly\\ncreated accounts to post and interact on the platform14.\\nMuch research on social media bots involve constructing bot detection algorithms and applying bot detection algorithms to\\nanalyze bot activity during an event. Bot detection algorithms typically extract a series of features from user and post data,\\nthen build a supervised machine learning model which classifies the likelihood of a user being a bot or a human 15. These\\nmachine learning models range from logistic regression16, to random forests17, to ensemble of classifiers4, 18, to deep learning\\nmethods19, 20. Another technique of bot detection is graph-based methods, which infers the probability of a user being a bot by\\nits connections, i.e. friends21, 22. Most recently, Large Language Models (LLMs) are incorporated in bot detection algorithms\\nto handle the diverse user information and content modalities23. These bot detection classifiers have been used to study bot\\nbehavior in many events, ranging from political events4, 24–26 to natural disasters27, 27 to the spread of information and opinions\\n1\\narXiv:2501.00855v1 [cs.CY] 1 Jan 2025'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 2, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? 
A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='on social media8, 28, 29.\\nAlthough researchers have built automatic bot detection classifiers, behavioral studies show that humans are unable to\\ndifferentiate between the bot and human user 30. In fact, the identification of bots by security students are akin to random\\nguesses31. Consequently, it is important to study bots and their characteristic nature and activity patterns. Our study is\\npositioned within the social cybersecurity realm, which studies how the digital environment, particularly bots, can be exploited\\nto alter the content and community relationships32.\\nThis article is being driven by looking at a first principles approach to bots. We ask the following research questions:\\n• RQ1: What is a social media bot? Many studies are predicated on a general purpose understanding of a bot, or a\\nspecific definition particular to the event of study. Instead, we pare down the definition of a bot into its treatment of the\\ncore elements of a social media platform (users, content, relationships).\\n• RQ2: How does the nature of a bot differ from a human? Systematically, we look at the difference between bots\\nand humans. We use a large scale dataset from Twitter/X that spans over 200 million users, and analyzed four aspects:\\nvolume of bot/human user types, use of linguistic cues, use of identity terms, and social interactions.\\nAfter an examination of the social media bot, we discuss how a bot is also an Artificial Intelligent (AI) agent, and its\\npotential evolution alongside technological advancements. We finally follow with a discussion of the open research challenges\\nof the study of bots to encourage future studies in this field. 
The challenges we identify reflect the definition of a bot: Detect, to\\nsystematically identify these automated and evolving bots; Differentiate, to evaluate the goodness of the bot in terms of their\\ncontent postings and relationship interactions; and Disrupt, to moderate the impact of malicious bots, while not unsettling\\ndigital human communities.\\nWhat is a Social Media Bot?\\nThe term “bot\" has become a pervasive metaphor for inauthentic online users8, 33. Most social media users have an implicit\\nunderstanding of a bot, as do most researchers30. Table 2 summarizes some of the recent definitions of bots retrieved from\\nacademic literature. The security industry also watches social media bot accounts, and Table 3 summarizes definitions from\\nindustry sources. Each of the definition grasps one or more relevant properties (highlighted in bold) of a social media bot, yet\\nare not sufficiently comprehensive to describe the bot. Some of these relevant properties are: “automated\", “interacts with\\nhumans\", “artificial agents\".\\nOne of the problems with existing definitions is that they often define bots as being malicious and they highlight the nefarious\\nuse of bots5, 34–37: “malicious actors\", “public opinion manipulation\", “malicious tasks\"18, 38, 39. Most often, the study of bots\\nis established upon nefarious tasks: election manipulation, information operations, even promoting extremism 33, 34, 40. The\\nexact same technology can be used in both good and bad ways. There are plenty of good bots41–43. Bots provide notifications\\nand entertainment44, such as the @CoronaUpdateBot found in our dataset which posts critical public health information. Bots\\nsupport crisis management efforts by gathering the needs and combined locations of people after a disaster, for authorities and\\ncommunity volunteers to identify crucial areas and providing help45. Chat bots provide emotional support during stress46 and\\ncontinue bonds in times of grieve47. 
Celebrities and organizations use bots to facilitate strategic communications with their fans\\nand clients3, 48.\\nIn essence, a bot is a computer algorithm. As an algorithm, a bot is neither bad or good. It is the persona it is afforded to\\nthat determines the goodness of its use. We develop a generic definition of a bot. The definition is independent of the use of the\\nbot. The determination of the use of the bot warrants separate treatment beyond this paper. Regardless of whether a bot is used\\nfor good or ill, the behavioral characteristics of bots remain the same.\\nTo better describe the social media bot, we first need to characterize the environment in which it lives: the social media\\nplatform. Within a social media platform, there are three main elements: users, content and relationships49. Users are represented\\nby their virtual accounts, and are the key actors driving the system, creating and distributing information. Content is the\\ninformation generated by users on the platform. Relationships are formed from the user-user, user-content and content-content\\ninteractions.\\nAfter distilling a social media platform into its core building blocks, it follows that definition of a social media bot should\\nbe based on the foundations of a social media platform as first principles. The presence of these components in each of the\\nreference definitions are broken down in Table 4. The first principles of a bot are:\\n• User: An automated account that carries out a series of mechanics. A key characteristic of bots is its programmability,\\nwhich give it its artificial and inauthentic characteristic. Automation is an aspect that has been iterated in all the reference\\ndefinitions. The key here is that a bot is automated. The model underlying the automation does not matter; any model\\ncan be applied equally well to humans and bot. 
A bot could be built to mimic humans, or it could be built to optimize\\n2/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 3, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='other functions. For example, some studies describe bots in terms of its mimicry of humans33, 50, but others observe that\\nbots eventually influence the social space such that humans mimic bots51, 52.\\n• Content: for content creation, distribution, and collection and processing. Bots often generate their content in bulk to\\ndistribute a certain ideology24, 53, such as a good portrayal of their affiliated country54. Instances where bots perform\\ncontent distribution is where the spread fake news and disinformation content28, 55, 56, or when they distribute general\\nnews information57. Bots in the form of web crawlers and scrapers download and index data from social media in bulk58,\\nand sometimes process the data to perform functions like analyzing sentiment of opinions59.\\n• Relationships: for relationship formation and dissolution. In other words, forming a relationship online means to connect\\nwith other users via post-based (i.e., retweet, mention) or user-based (i.e., following, friend) mechanisms. Dissolving\\na relationship means to destroy a connection by forcing users to leave a community. Bots are an actively form and\\ndestroy relationships on social media platforms. An example of the formation of post-based relationship is the active\\namplification of narratives. 
This technique is mostly employed in the political realm where the bots retweet political\\nideology in an organized fashion24, 60, 61. User-based relationships can grow through coordinated fake-follower bots,\\nthat are used to boost online popularity62, or can be dissolved through toxic bots that spread hate and directed religious\\nideologies40, 63, causing users to break away from the community64.\\nFigure 1 reflects a first principles definition of a social media bot. A Social Media Bot is “An automated account that carries\\nout a series of mechanics on social media platforms, for content creation, distribution, and collection and processing, and/or for\\nrelationship formation and dissolutions.\" This definition displays the possibilities of mechanics that the bot account can carry\\nout. A bot does not necessarily carry out all the mechanics. The combinations of mechanics that a bot carries out thus affects\\nthe type of bot it is and the role it plays within the social media space, and as shown in Table 1, those mechanics can be used for\\neither good or bad. Bot types can be named for their actions or for their content. For example, a bot that carries out relationship\\nformation between two different communities, and does not do any content mechanics can be classified as a bridging bot16. We\\nillustrate a few types of bots and their use for good and bad in Table 1. Note that this list is not meant to be an exhaustive list\\nbut an illustrative list of the variety of bots in the social media space.\\nFigure 1.Definition of Social Media Bot. This definition displays the possibilities of mechanics that the bot account can carry\\nout. 
A bot does not necessarily carry out all the mechanics.\\nType of Bot Use for Good Use for bad\\nGeneral Bot search engine optimization, data collection 58 spread disinformation29, manipulate opinion8\\nBridging Bot political commentators that aggregate informa-\\ntion16\\ncross-cultural social marketing, disseminate infor-\\nmation across community differences54\\nPolitical Bot “establishing brand and amplifying messages\"3, 25,\\ndigital campaigning16\\npolitical manipulation65\\nChat Bot emotional support during stress 46 and grieve47 Post offensive and inflammatory comments66\\nActivist Bot crisis management 45 trigger and initiate activism2, 9\\nTable 1.Illustration of type of bots and their role in the social media space. Note that this list is not exhaustive but illustrative.\\n3/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 4, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='Year Reference Definition\\n2016 33 A social bot is a computer algorithm that automatically\\nproduces content and interacts with humans on social\\nmedia, trying to emulate and possibly alter their behavior.\\n2016 26 [...] 
social bots, algorithmically controlled accounts\\nthat emulate the activity of human users but operate at\\nmuch higher pace (e.g., automatically producing content\\nor engaging in social interactions), while successfully\\nkeeping their artificial identity undisclosed\\n2016 50 Automated accounts, called bots, [...]\\n2018 58 Bots are have been generally defined asautomated agents\\nthat function on an online platform [..]. As some put it,\\nthese are programs that run continuously, formulate deci-\\nsions, act upon those decisions without human interven-\\ntion, and are able adapt to the context they operate in.\\n2018 67 The term “social bot” describes accounts on social media\\nsites that are controlled by botsand imitate human users\\nto a high degree but differ regarding their intent.\\n2018 17 [...] malicious automated agents\\n2020 20 Social Media Bots (SMB) are computer algorithms that\\nproduce content and interacts with users\\n2020 38 [...] social bots, (semi-) automatized accounts in social\\nmedia, gained global attention in the context of public\\nopinion manipulation.\\n2020 18 Malicious actors create inauthentic social media ac-\\ncounts controlled in part by algorithms, known as social\\nbots, to disseminate misinformation and agitate online\\ndiscussion.\\n2021 68 Social bots – partially or fully automated accounts on\\nsocial media platforms [...]\\n2022 69 Social media bots are automated accounts controlled by\\nsoftware algorithms rather than human users\\n2023 41 Social bots are automated social media accounts gov-\\nerned by software and controlled by humans at the back-\\nend.\\n2023 15 A bot is a software that mimics human behavior and oper-\\nates autonomously and automatically.\\n2023 70 Twitter accounts controlled by automated programs.\\n2023 71 Automated accounts on social media that impersonate\\nreal users, often called “social bots,”\\n2023 72 Social bots are social media accounts controlled by soft-\\nware that can carry out 
content and post content auto-\\nmatically.\\n2024 30 Social bots are artificial agents that infiltrate social media\\n2024 73 Social bots are social media accounts controlled in part\\nby software [...] Social media bots display profiles and\\nengage with others through various means, including fol-\\nlowing, liking, and retweeting\\nTable 2.Definitions of “Social Media Bot\" in academic literature.\\n4/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 5, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='Year Reference Definition\\n2018 US Department of\\nHomeland Security39\\n[...] Social Media Bots as programs that vary in size de-\\npending on their function, capability, and design; and can\\nbe used on social media platforms to do various useful\\nand malicious tasks while simulating human behavior\\n2024 Microsoft 74 Social media bots are automated programs designed to\\ninteract with account users.\\n2024 Meltwater 75 Refers to the definition by US CSIA (see below)\\nNot Dated CloudFlare 37 [...] social media bots are automated programs used to\\nengage in social media. 
These bots behave in an either\\npartially or fully autonomous fashion, and are often de-\\nsigned to mimic human users.\\nNot Dated Cybersecurity and In-\\nfrastructure Security\\nAgency (CISA)44\\nSocial Media Bots are automated programs that simu-\\nlate human engagement on social media platforms.\\nNote Dated Imperva 76 An Internet bot is a software application that runs auto-\\nmated tasks over the internet.\\nTable 3.Definitions of “Social Media Bot\" in industry literature\\nUser Content Interactions\\nReference Automation Mimicry Creation Distribution Communication Relationship\\n33 x x x\\n26 x x x x x\\n50 x\\n58 x\\n67 x x x\\n17 x\\n20 x x x x\\n38 x\\n18 x x\\n68 x\\n69 x\\n15 x x\\n41 x x x\\n70 x\\n71 x x\\n72 x x x\\n30 x\\n73 x x x\\nUS Department of\\nHomeland Security\\nx x\\nMicrosoft x x x\\nCloudFlare x x x x\\nCISA x x x x\\nImperva x\\nTable 4.Components of definitions of “Social media Bot\"\\n5/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 6, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='Results\\nWe perform a global comparison of bot and human characteristics by combining several datasets obtained from X (previously\\nnamed Twitter) using the Twitter V1 Developed API. These events are: Asian Elections 25, 34, Black Panther 77, Canadian\\nElections 201978, Captain Marvel79, Coronavirus,80 ReOpen America9, 80 and US Elections 202080. In total, these datasets\\ncontain ∼ 5 billion tweets and ∼ 200 million users. 
Each user in this database is labeled as bot or human using the BotHunter\\nalgorithm17.\\nHow many bots are there?\\nFigure 2 presents the percentage of bot users within each dataset. On average, the bot volume across the events are about 20%\\nwith the bot percentage spiking up to 43% during the US Elections. This is in line with past work, where a general sample of\\nusers usually reveal a bot percentage below 30%70, yet in a politically-charged topic (i.e. elections, tensions between countries),\\nthe bot percentage rises34, 80. Our estimate is also empirically consistent with Elon Musk’s estimate of 20%13. This finding is\\nimportant for event analysis, because it provides a comparison baseline towards the percentage of bot-like users within an event.\\nSpikes in bot user percentage beyond 20% suggest that the event and conversation has caught the interest of bot operators, and\\nthe analyst can monitor for signs of conversation manipulation.\\n19.7%\\n80.3%\\n43.9%\\n56.1%\\n18.5%\\n81.5%\\n18.4%\\n81.6%\\n20.4%\\n79.6%\\n17%\\n83%\\n15.7%\\n84.3%\\n21.9%\\n78.1%\\nAsian Elections\\nBlack Panther\\nCanadian Elections 2019\\nCaptain Marvel\\nCoronavirus 2020−2021\\nReOpen America\\nUS Elections 2020\\nOverall\\nDataset\\nPercentage\\nClass\\nBot\\nHuman\\nFigure 2.Comparison of Bot volume across events. The percentage of bot users across the events are on average around 20%.\\nHow do bots differ from humans linguistically?\\nWe extract psycholinguistic cues from the tweets using the NetMapper software 81. The software returns the count of each\\ncue in the sentence, i.e., the number of words belonging to the cue in the tweet. There are three categories of cues: semantic,\\nemotion and metadata. Semantic and emotion cues are derived from the tweet text, while metadata cues are derived from the\\nmetadata of the user. Semantic cues include: first person pronouns, second person pronouns, third person pronouns and reading\\ndifficulty. 
Emotion cues include: abusive terms, expletives, negative sentiment, positive sentiment. Metadata cues include:\\nthe use of mentions, media, URLs, hashtags, retweets, favorites, replies, quotes, and the number of followers, friends, tweets,\\ntweets per hour, time between tweets and friends:followers ratio.\\nFigure 3a presents the differences between cues used by bots and humans. The detailed numeric differences are in the\\nSupplementary Material. This difference is examined overall, and by event. There are consistent differences in the use of cues\\nby bots and humans. For example, across all events, bots use significantly more abusive terms and expletives, and tweet more\\nthan humans. On the other hand, humans use more first person pronouns, positive sentiment, and media (i.e., images, videos).\\nHumans tend to quote and reply to tweets, while bots tend to retweet.\\nMost events have consistent cue distribution, but some events look different. In general, humans use more sentiment cues.\\nHowever, in the two elections (US Elections 2020 and Canadian Elections 2019), bots used more sentiment cues. This reveals a\\ndeliberate attempt to use bots during the election seasons to polarize online sentiments. Prior research has shown that bots can\\nbe highly negative during the election season82, and that bots express hugely different sentiment sentiment when mentioning\\ndifferent political candidates8, 83.\\n6/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 7, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. 
Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='Semantic Cues Emotion Cues Metadata Cues\\n(a) Differences in the use of psycholinguistic cues between bots and humans.\\nSemantic Cues Emotion Cues Metadata Cues\\n(b) Differences in the use of psycholinguistic cues between bots and humans for the combination of Captain\\nMarvel and Black Panther datasets. This compares the cue distribution with and without retweets.\\nFigure 3.Comparison of psycholinguistic overall cue usage (average cue usage per user) by bots and humans across datasets.\\nGreen cells show that humans use a larger number of the cue. Red cells show that bots use a larger number of the cue. *\\nindicates there is a significant difference in the usage of the cue between bots and humans.\\n7/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 8, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='When a bot retweets a human, its linguistic profile, by definition, is identical to the human’s. The question though, is\\nwhether the bots that are sending original tweets match the linguistic profile of those retweeting, or is the linguistic profile\\ndifferent? For the Black Panther and Captain Marvel events (Figure 3b), we compared the psycholinguistic profile for all tweets,\\nand the original tweets only (i.e., no retweets). In these two events, bots retweet significantly more than humans. 
However, the\\nbot-human difference between linguistic cue use of the original tweets vs all tweets are rather similar. Only the average tweet\\nreading difficulty and the number of friends are different: in original tweets, humans have higher values; in all tweets, bots have\\nhigher values. Therefore, bots have their unique signature when generating new content, but are guaranteed to match human’s\\ncontent when retweeting the human’s.\\nBots construct tweets with cues that can be easily and heavily automated, while humans construct more personal tweets that\\nrequire higher cognitive processing to create. Such differences shows how bot accounts still use rather rudimentary techniques:\\nhashtag latching using multiple hashtags 27, 40, connecting several users together with increased number of mentions 54 and\\nflooding the zone with lots of tweets tweets of their desired narratives 24, 84. More sophisticated communication techniques\\nlike having an increased number of media, and more advanced interaction techniques that involve dialogue understanding like\\nincreasing the number of replies and quotes, are still left to the humans. In short, bots have not entirely mimicked humans, yet.\\nHow do bots present themselves differently from humans?\\nSocial identity theory depicts how social media users portray their image online, and the community that they want to be\\nassociated with85, 86. We analyze the difference in the self-presentation of the identities between bots and humans, and the\\ndifference between the linguistic cues used by the identities. Across the events, there are consistently a smaller proportion of\\nbots that present with an identity. Overall, 21.4% of bots present an identity, while 27.0% of humans present an identity (see\\nAppendix Table 10). Bots are more likely to obfuscate their identities87, allowing them to take on different personas to suit their\\noperation requirements88. 
Figure 4a presents the top 25 identities by frequency between bots and humans. There is a more\\nexponential drop of the frequency of the use of identities in bot users than in human users, suggesting that bots concentrate their\\nself-presentation on certain identities, mostly the common ones: man, son, fan, lover; while humans have a more varied identity\\npresentation.\\nWe then ask a follow-up question: “How do the same bot/human identities talk about the same topics?\" We compare the use\\nof topic frames per identity for the most frequent identity affiliations in Figure 4b. This plots the percentage difference of the\\nuse of framing cues (Family, Gender, Political, Race/Nationality, Religion) between bots and humans. This metric compares\\nthe use of cues with the human usage as a baseline. Overall, bots converse more aggressively in all topic frames. In particular,\\nbots converse most around societal fault lines: gender, political, race/nationality. These conversations lie on societal fault\\nlines, which could sow discord and chaos89, therefore such bots are of interest to monitor and moderate. In fact, bots use more\\ngender-based cues. Other research groups have also identified that a disproportionate number of bots that spread disinformation\\nare females90, 91, and are thus more likely to use gender frames in their posts. Bots tend to converse largely about political\\ntopics, regardless of the identity they affiliate with, indicating that a good proportion of bots are deployed for political purposes,\\neither by political parties or by political pundits 16, 26, 68. Finally, the difference between the usage of topic frames between\\nbots and humans could be due to their vocabulary used. The words used by humans are more varied and mostly not standard\\ndictionary words, while bots are still being programmed with a limited set of vocabulary, as evidenced by the proportion of\\nwords identified by the dictionaries in the NetMapper program used. 
In a similar aspect, chat bot interactions have a more\\nlimited vocabulary than human interactions92.\\nFigure 4c presents the average use of topic frames by identity categories. Humans affiliate themselves equally with all\\nidentity categories, while bots generally affiliate themselves with racial and political identities. Both bots and humans converse\\na lot on gender and political issues.\\nBots converse mostly about topics that closely match their identity. For example, a bot that presents itself as “man\" and\\n“son\" mostly converse about family then gender; while bots that take on the identities “conservative\" and “american\" converse\\nsignificantly more about politics. This observation can be read from the heatmap: for the bots that associate with the religion\\nidentity, the average use of religious words is 0.04, while that for humans is 0.00. If the users associate with the family identity,\\nthe average proportion of the use of family words within the content is 0.19 for bots and 0.04 for humans. Such is the curated\\npresentation and programming of bot users, which allows for an aspect of predictability - if a bot user affiliates with a certain\\nidentity, it is likely to talk about topics related to its identity. This shows that bots are likely designed to look like humans. They\\nare strategically designed to be in character by having the right affiliation to fit in and converse with a specific group.\\nOur observations in the affiliation of identities by bots in their user description and the use of identity-related topic frames\\nmeans that bots are being used strategically. They are not just used to support or dismiss groups in general, but are specifically\\nbeing aimed at a gender (i.e., women or men), or at a political actor (i.e., president, governor, politician). 
Bots are overused in\\nthe political, religious, and racial realm, suggesting that they are targeting topics of societal tensions.\\n8/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 9, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='(a) Comparison of the use of the identity affiliations by\\nbots and humans. 21% of the users affiliate with an identity\\nin their user description.\\nFrame: Race/Nationality Frame: Religion\\nFrame: Family Frame: Gender Frame: Political\\n−9\\n−6\\n−3\\n0\\n−9\\n−6\\n−3\\n0\\n−9\\n−6\\n−3\\n0\\nally\\nartist\\nass\\nblack\\nfan\\nfriend\\ngirl\\ngod\\nlover\\nman\\nperson\\nrat\\nson\\nstar\\nstudent\\nwriter\\nally\\nartist\\nass\\nblack\\nfan\\nfriend\\ngirl\\ngod\\nlover\\nman\\nperson\\nrat\\nson\\nstar\\nstudent\\nwriter\\nally\\nartist\\nass\\nblack\\nfan\\nfriend\\ngirl\\ngod\\nlover\\nman\\nperson\\nrat\\nson\\nstar\\nstudent\\nwriter\\nally\\nartist\\nass\\nblack\\nfan\\nfriend\\ngirl\\ngod\\nlover\\nman\\nperson\\nrat\\nson\\nstar\\nstudent\\nwriter\\nally\\nartist\\nass\\nblack\\nfan\\nfriend\\ngirl\\ngod\\nlover\\nman\\nperson\\nrat\\nson\\nstar\\nstudent\\nwriter\\nPercentage Difference\\n(b) Percentage Difference (H−B)\\nH of the use of each topic frame in messages by\\nthe top frequent identity affiliations.\\n0.19 0.31 0.30 0.17 0.04\\n0.18 0.31 0.27 0.15 0.03\\n0.17 0.29 0.26 0.15 0.03\\n0.16 0.28 0.24 0.14 0.03\\n0.19 0.31 0.35 0.19 0.04\\n0.17 0.29 0.33 0.19 0.03\\n0.16 0.29 0.30 0.16 0.04\\n1.22 2.09 2.04 1.16 
0.24\\n1.00\\n0.94\\n0.91\\n0.85\\n1.08\\n1.01\\n0.95\\n6.75\\nfamily\\ngender\\njob\\nother\\npolitical\\nrace_nationality\\nreligion\\ntotal\\navg_familyavg_gender avg_political\\navg_race_nationality\\navg_religion\\ntotal\\n0.00 0.10 0.20 0.30 0.40\\nBots\\n0.04 0.07 0.07 0.02 0.00\\n0.05 0.08 0.06 0.02 0.00\\n0.05 0.08 0.07 0.03 0.01\\n0.05 0.08 0.06 0.02 0.01\\n0.05 0.08 0.08 0.03 0.01\\n0.06 0.08 0.10 0.04 0.00\\n0.04 0.09 0.09 0.03 0.01\\n0.34 0.56 0.54 0.19 0.04\\n0.20\\n0.22\\n0.24\\n0.22\\n0.26\\n0.27\\n0.26\\n1.67\\nfamily\\ngender\\njob\\nother\\npolitical\\nrace_nationality\\nreligion\\ntotal\\navg_familyavg_gender avg_political\\navg_race_nationality\\navg_religion\\ntotal\\n0.00 0.10 0.20 0.30 0.40\\nHumans\\nT opic Frame\\nIdentity Category\\n(c) Average use of topic frames by identity category referred to. Bots are more likely to refer to gender and\\npolitical identities, and are more likely to utilize racially typed language.\\nFigure 4.Comparison of identity-related behaviors in bots and humans\\nHow do bots communicate differently from humans?\\nSocial interactions between users are an indication of the information dissemination patterns and the communication strategies of\\nthe users. We calculate the network metrics (total degree, in degree, out degree, density) of the all-communication ego-networks\\nof the users. In the network graphs, the users are nodes, and the links between users represent all communications between the\\ntwo users (i.e., replies, quotes, mentions, retweets). Table 5 compares the two metrics for bots and humans. Bot ego networks\\n9/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 10, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. 
Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='have higher density than ego networks (8.33% more dense), which reflects that the bots have tighter communication structures\\nand form more direct interactions than humans. On average, a bot has 9.66% bot alters and 90.34% human alters, whereas on\\naverage a human has 7.31% bot alters and 92.69% human alters. Although bots interact with a higher proportion of bot alters\\nthan humans do (32% more bot alters), our findings show that both bots and humans interact more with humans rather than bots\\nin their ego network. By the principle of homophily, it is natural for humans to interact with other humans93. However, bots\\nviolate the principle of homophily, and instead of interacting with more bots, they interact with more humans. Therefore, bots\\nare actively forming communication interactions with humans, perhaps attempting to influence humans94.\\nBot Humans\\nIn-degree 0.05 ± 0.08 0.02 ± 0.02\\nOut-degree 8E-4 ± 1.4E-3 1.6E-3 ± 3.3E-3\\nTotal degree 0.15 ± 0.09 0.16 ± 0.11\\nDensity 0.35 ± 0.06 0.034 ± 0.06\\n% bot alters 9.66 ± 2.98 7.31 ± 3.10\\nTable 5.Comparison of network metrics. For the in-degree, out-degree, total degree and density, we present the ratio of\\nmean(metric) for agent type : max(metric) across all agents in the event\\nFigure 5 shows the interaction of bots and humans in a network diagram. These users are illustrative of the most frequent\\ncommunicators in the Asian Elections dataset. In this diagram, users are represented as nodes, and links between users represent\\na communication (e.g. a retweet, reply, mention). The network diagrams presented are one- and two-degree ego-networks,\\ngenerated by the ORA software81. 
This means that the networks present users that are in direct communication with the user\\n(1-degree), and are in direct communication with the 1st-degree users (2-degree).\\nFigure 5.Ego network structures of Bots and Humans who are the most frequent communicators in the Asian Elections\\ndataset. Nodes represent social media users. Links between users represent a communication relationship between the two\\nusers (i.e., retweet, mention). Bot users are colored in red, human users in grey. The width of the links represent the extent of\\ninteractions between the two users. In these most frequent communicators, bots have a star network structure, and humans a\\ntree structure. Bot networks have more bot alters, while human networks have more human alters.\\nA common way for bots to be used in political discourse (e.g. elections) is to amplify other users. As an amplifier, bots are\\n10/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 11, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='the pendants of the user they are amplifying. Therefore, bots appear in star networks in many of the peripheral nodes. A star\\nstructure is a network that have a strongly connected core and peripheral networks. This structure is most prominent in bots in\\npolitical discourse, where core bots create information, and peripheral bots amplify the information through the retweeting\\nmechanic24. 
Humans, on the other hand, are more likely to be part of a tree structure, where one can make out the tiered\\nfirst- and second-hop interactions. In the same discourse, humans are more likely to be performing many actions, sometimes\\nretweeting other users, sometimes tagging other users and so forth.\\nThis difference in interactions between bot and human users reveals the communication patterns of both user classes. The\\nstar structure of bots suggests that they have a hierarchy of interconnected bot users in an operation network to disseminate\\ninformation, which is easily achieved with the help of automation. On the other hand, humans communicate predominantly\\nwithin their immediate network before extending their communication outwards. The bot ego networks are more dense,\\nsignifying that they were constructed to interact more than do humans, and are sometimes constructed as networks of bots (the\\nbotnet)94, 95.\\nDiscussion\\nThrough our large scale empirical study, we show that bots and humans have interesting and consistent differences between\\nthem. These differences span from their volume, to the linguistic features of their text, to the identities they affiliate with, to\\ntheir social network structure. These features can be used to characterize a social media bot, and how it differs from humans.\\nWe study a huge amount of data dated from 2018 to 2021. These data show consistent differences over the years, which means\\nthat while bot technology do evolve, it does not evolve drastically. Moreover, the consistent differences show that there are\\nscenarios where bots can be better than humans, and scenarios where humans can be better than bots. These differences provide\\ninsights to how both can be utilized to afford conversations on social media: bots can be used for methodological postings with\\ndeliberate selection of hashtags, tweets per hour, and a structured star communication network. 
Humans can be used for more\\ncomplex cognitive tasks such as adding media to a post or replying to a post, and for conversing on a larger range of topics3.\\nStrengths of Social Media Bots\\nAn Artificial Intelligent (AI) system is a machine, or computer system, that can perceive its environment and use intelligence to\\nperform actions to achieve defined goals96. The social media bot perceives the digital environment to decide their targets (i.e.,\\nusers to retweet, users to mention), and intelligently carry out content and interaction mechanics to achieve their goals (i.e.,\\nspreading information28, 56, sowing discord97, 98, assisting the community45–47). The software-programmed social media bot is\\nan AI algorithm, and thus has potential to be harnessed for social good.\\nTable 6 lists some recommendations of our results on how bots can be leveraged on for social good, and how they can be\\nregulated. First, given that bots use more retweets and mentions than humans, and have high tweets per hour, bots can be used\\nfor menial tasks like announcements and distribution of information. Second, since bots have a star interaction network, they\\ncan be used for big announcements like disaster and crisis management without message distortion. A star network sends\\nmessages directly through interactions, hence the original message is preserved. However, the human’s hierarchal interaction\\nnetwork will distort the message as it passes through the tiers. Third, bots typically post content that matches their identity,\\nthey can be used to provide educational material about topics that people associate with certain profession. For example, a\\nweather news bot can provide weather information. 
Lastly, since bots use more abusive and expletive terms than humans,\\ninstead of regulating toxic language itself, regulation can be focused on disallowing bots to use such toxic language, which\\nwould therefore reduce the amount of hyperbole and offense online.\\nResult Recommendation\\nBots use a lot of retweets and mentions, and have\\nhigh tweets per hour\\nUse bots for menial tasks like announcements and amplification of\\nannouncements\\nBots have a star interaction network Use bots for big announcements (e.g., disaster, crisis management) with-\\nout message distortion\\nBot content matches identity Use bots to provide educational material about topics that people asso-\\nciate with certain professions (e.g. weather information from a weather\\nnews bot)\\nBots use more abusive and expletive terms than\\nhumans\\nFocus regulation to disallow bots to use toxic language\\nTable 6.Recommendations of our observations on leveraging and regulating bots for social good\\n11/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 12, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='Challenges and Opportunities of studying Social Media Bots\\nNext, we elaborate on three challenges in the study of social media bot, and discuss some opportunities for future research.\\nDetect The first step to bot detection is to systematically detect these bots. However, these automated agents are constantly\\nevolving and adapting their behavior in response to the changing setup of social media platforms and user patterns. 
The\\nstricter data collection rules of social media platforms99, 100 and the increasing usage of AI in these bot agents73 creates further\\nvariability in these digital spaces bots reside in. This therefore muddles any developed algorithms based on previous datasets.\\nAlready, linguistic differences between bot and human tweets have narrowed between 2017 and 2020, making bot accounts\\nmore difficult to systematically differentiate19. More recently, AI-powered botnets have emerged, using ChatGPT models to\\ngenerate human-like content73, closing the gap between bot and human.\\nBot evolution and bot detection are thus a “never-ending clash\"101, and sometimes bot accounts evolve faster than current\\nknown bot detection algorithms68, presenting several opportunities in continual improvement of bot detection algorithms,\\nspecifically to be adaptable, faster, and more efficient. The increasing trends of using Large Language Models and Large Vision\\nModels to create generated texts and deepfakes lend bots a helping hand in the construction of more believable narratives.\\nThese same generative technology are also used to construct offensive bots for humor102. However, current trends reflect that\\nthe use of such technologies are not very prevalent, for example,73 only found one set of such botnet in their study, reflecting\\nthat bots are still relying on traditional techniques, likely because such heuristic-based techniques are easier and faster to deploy\\nen masse.\\nDifferentiate After identifying which users are likely to be bots, one must differentiate the goodness of the bot and its function.\\nThis evaluation can be inferred from the bot’s content postings and relationship interactions. However, bots do not fall squarely\\nin a spectrum of goodness; the lines of good and bad bots are blurred. In fact, bots can move between neutral in which they\\npost messages that are not harmful, to bad, where they post politically charged and extremist messages40, 103. 
Herein lies an\\nopportunity to construct a rubric to determine the goodness of the bot; this, though, is a complex task, for there are ethical and\\nsocietal issues to consider. Bots can change their goodness, too. They may be supporting a certain cause initially, then making\\na swing to a different stance soon enough. This swing of support was witnessed during the coronavirus pandemic era, and\\nespecially so when the bots require little conviction to change allegiances8. Another challenge involves identifying the type\\nof bot, which can provide insight towards possible impact of the bot. For example, an Amplifier Bot that intensifies political\\nmessages could be intended to sow discord24, 61.\\nDisrupt The third challenge is to mindfully disrupt the operations of bot users. That is, moderating the impact of malicious\\nbots, while not unsettling human conversations. While banning bot users can be an easy solution, a blanket ban can result in\\nmany false positives, which thus results in humans being identified as bots and being banned. Such situations can result in\\nemotional or psychological harm of the human being banned, or toxic online behavior where users repeatedly report another\\nuser that they personally dislike as a bot to silence them104. Additionally, social media bots do not necessarily work alone: they\\ncoordinate with other bots – sometimes even human agents – to push out their agenda80, and therefore if one agent warrants a\\nban, should the entire network be banned? To ban an entire network may entangle several unsuspecting humans who have been\\ninfluenced by the bots to partake in the conversation. With these considerations in mind, regulation is a scope of problem with\\nwhich to be studied: which types of bots should we ban? What are the activities of a bot that would warrant a ban?\\nMethods\\nExamining Bot Literature\\nWe examined recent bot literature for the definition of “social media bot\". 
For academic definitions, we searched the phrase\\n“social media bot\" on Google Scholar. For industry literature, we searched the phrase “social media bot\" on Google Search.\\nThen, we manually scanned through the results. We picked out the more relevant and highly cited papers that had a definition of\\na social media bot. We read through each paper, and manually extracted the definition of a social media bot stated in the paper.\\nNext, we looked through all the definitions and picked out key phrases. We then harmonized the phrases and definitions to\\ncreate a general definition of the bot. All authors agreed on the definitions and categorizations.\\nData Collection and Labeling\\nWe collected a dataset from Twitter/X involving global events which provides a richness in a general understanding of the bot\\nand human differentiation. The list of data collection parameters are detailed in Appendix Table 8.\\nWe labeled each user in this dataset as bot or human with the BotHunter algorithm. This algorithm uses a tiered random\\nforest classifier with increasing amounts of user data to evaluate the probability of the user being a bot. The algorithm returns\\na bot probability score that is between 0 and 1, where scores above 0.7 we deem as a bot, and scores below 0.7 we deem\\nas a human. This 0.7 threshold value is determined from a previous longitudinal study that sought to identify a stable bot\\n12/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 13, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. 
Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='score threshold that best represents the automation capacity of a user69. This bot algorithm and threshold is chosen so that our\\nstudies will be consistent with the original studies of the dataset that used the BotHunter algorithm9, 34, 77–80. We calculated the\\nproportion of bot users against the total number of users within each event. Our results are presented in a bar graph.\\nComparison by Psycholinguistic Cues We parse the collected dataset of tweets through the NetMapper software 81 to\\nextract out psycholinguistic cues of the texts. NetMapper extracts the number of each of the cues per tweet. The software\\nreturns three types of cues: semantic cues, emotion cues and metadata cues. The linguistic cues are returned by matching words\\nagainst a dictionary for each category. The dictionary has words in 40 languages. Then, for each user, we average the use of\\neach cue per category, as the trend for the user. We then perform a student t-test comparison between the cues of each user type\\nwith Bonferroni correction, and identify whether the cues are significantly different between the bot and human at the p < 0.05\\nlevel. We then remove the retweets from the Captain Marvel and Black Panther datasets and compare the cue distribution of\\noriginal tweets with all tweets. This analysis compares the differences in the distribution of cues of tweets originating from the\\nuser type and their retweets.\\nComparison by Self-Presentation of IdentityTo classify identities, we compare the user description and bio information\\nagainst a survey of occupations of United States users performed in 2015105. If the occupation is present in the user information,\\nthe user is tagged with the identity. A user can have more than one identity. 
We compare the top identities used by bots and\\nhuman users across all events. These identities are also divided up into seven categories: religion, race/nationality, political, job,\\ngender, family and others. We then classify each user into these categories of identities. Again, each user can fall into multiple\\ncategories.\\nNext, we examined how different identities frame their posts differently. We extract framing cues from the overall set of of\\npsycholinguistic cues generated. The topic frames we examined are: family, gender, political, race/nationality and religion. For\\neach most frequent identity affiliated with by bots and humans, we compare the difference in the average use of each topic\\nframe through a percentage difference calculation. The percentage difference in the use of framing cues is calculated as: (H−B)\\nH ,\\nwhere H is the average use of the framing cue by humans, and B is the average use of framing cue by bots. This comparison\\ntells us how much more bots use a framing cue as compared to humans. If the percentage is negative, bots use the framing cue\\nmore than humans. If the percentage is positive, bots use the cue less than humans.\\nThe set of topic frames also corresponds with the identity categories. Therefore, we also compared the identity categories\\nagainst the average use of each topic frame. This comparison is performed across bots and humans. We plot heatmaps to show\\nthe relationship between the average use of each topic frame topic frame against the identity categories.\\nComparison by Social Interactions We construct the all-communication ego-networks of the users in our dataset. We\\nanalyzed all the users for Asian Elections, Black Panther, Canadian Elections 2019, Captain Marvel and ReOpen America\\nevents. Due to the size of the data, we analyzed a 2% sample of users of the Coronavirus2020-2021 users (N = 4.6mil), and a\\n50% sample of users from the US Elections 2020 (N = 500k). 
The ego-networks are network graphs of the bot and human users\\nin focus. In the networks, each user is represented as a node, and a communication interaction between users are represented as\\nlinks. The ego-networks are constructed using all-communication interactions, that is any communication between users (i.e.,\\nretweet, @mentions, reply, quote) is reflected as a link. We analyzed the network properties of the ego-networks constructed\\nper event. These properties are: total-degree, in degree, out degree, density. We also analyzed the number of bot and human\\nalters there are in the ego networks. No pre-processing were performed on the networks prior to the calculations. We used the\\nORA software to load in the networks and perform the calculations81. We finally visualize the network graphs of one- and two-\\ndegree ego networks of a sample of bots and humans Figure 5. These are the 20 most frequent communicators in the Asian\\nElections sub-dataset A 1-degree network shows alters (connected users) that are in direct communication with the user, and a\\n2-degree network shows alters in direct communication with the 1st-degree alters.\\nConclusion\\nSocial media bots are deeply interweaved into our digital ecosystem. More than half of the Internet traffic in 2023 were\\ngenerated by these AI agents106. Bots are able to generate this volume of traffic because of their use of automation, which\\nenables them to create more content and form more relationships. This article surmised a definition of a social media bot\\nbased on the three elements that a social media platform contains: user, content, interactions. Our definition breaks down\\nthe automation on social media platforms into its core mechanics, and therefore provide the foundation for further research,\\nanalysis and policies regulating the digital space. 
We performed a large scale data analysis of bot and human characteristics\\nacross events around the globe, presenting the uniqueness of the bot species from a macro perspective: how bots and humans\\ndiffer in terms of the use of linguistic cues, social identity affiliations and social interactions. On a global scale, bots and\\nhumans do have consistent differences, which can be used to differentiate the two species of users. Table 7 summarizes the\\ndifferences between bots and humans as a conclusive remark. Finally, we provide recommendations for the use and regulation\\n13/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 14, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='of bots. These recommendations are informed by our results. We also lay out the challenges and opportunities for the future of\\nbot detection in a “Detect, Differentiate, Disrupt\" frame. 
We invite academics, non-profits and policymakers to take part in this\\nactive research area.\\nBots Humans\\nVolume (%) 21.9 ± 9.8 78.1 ± 9.8\\nPsycholinguistic Cues Use more hashtags, mentions; has\\nmore tweets/hour, total tweets,\\nfriends:followers ratio\\nuses more media, favorites, replies,\\nquotes, urls\\nSelf-presentation of Identity Concentrate their affiliations on a\\nfew identities\\nHave a more varied identity affilia-\\ntions\\nHave identity affiliation (%) 21.4±5.7 27.0 ±9.2\\nTopic Frames Political topics Family and Gender\\nIdentity vs Topic Frames Converse about topics that closely\\nmatch their identity\\nHave a larger range of topics\\nSocial Interactions Star communication structure Tiered communication structure\\nDenser interaction networks Less dense interaction networks\\nInteract with more human than bot\\nalters\\nInteract with more human than bot\\nalters\\nTable 7.Summary of Differences between Bots and Humans.\\nReferences\\n1. Woolley, S. C. Automating power: Social bot interference in global politics. First Monday (2016).\\n2. Lotan, G. et al. The arab spring| the revolutions were tweeted: Information flows during the 2011 tunisian and egyptian\\nrevolutions. Int. journal communication 5, 31 (2011).\\n3. Ng, L. H. X., Robertson, D. C. & Carley, K. M. Cyborgs for strategic communication on social media. Big Data & Soc.\\n11, 20539517241231275 (2024).\\n4. Ng, L. H. X. & Carley, K. M. Assembling a multi-platform ensemble social bot detector with applications to us 2020\\nelections. Soc. Netw. Analysis Min. 14, 45 (2024).\\n5. Chang, H.-C. H., Chen, E., Zhang, M., Muric, G. & Ferrara, E. Social bots and social media manipulation in 2020: The\\nyear in review. In Handbook of Computational Social Science, Volume 1, 304–323 (Routledge, 2021).\\n6. Seckin, O. C., Atalay, A., Otenen, E., Duygu, U. & Varol, O. Mechanisms driving online vaccine debate during the\\ncovid-19 pandemic. Soc. Media+ Soc. 10, 20563051241229657 (2024).\\n7. 
Ferrara, E. What types of covid-19 conspiracies are populated by twitter bots? arXiv preprint arXiv:2004.09531 (2020).\\n8. Ng, L. H. X. & Carley, K. M. Pro or anti? a social influence model of online stance flipping. IEEE Transactions on Netw.\\nSci. Eng. 10, 3–19 (2022).\\n9. Magelinski, T., Ng, L. H. X. & Carley, K. M. A synchronized action framework for responsible detection of coordination\\non social media. arXiv preprint arXiv:2105.07454 (2021).\\n10. Broniatowski, D. A. et al. Weaponized health communication: Twitter bots and russian trolls amplify the vaccine debate.\\nAm. journal public health 108, 1378–1384 (2018).\\n11. Shao, C. et al. The spread of low-credibility content by social bots. Nat. communications 9, 1–9 (2018).\\n12. Ng, L. H. X., Cruickshank, I. J. & Carley, K. M. Cross-platform information spread during the january 6th capitol riots.\\nSoc. Netw. Analysis Min. 12, 133 (2022).\\n13. Ingram, M. Musk’s Twitter bid, and the ‘bot’ complication. https://www.cjr.org/the_media_today/\\nmusks-twitter-bid-and-the-bot-complication.php (2022). [Accessed 28-10-2024].\\n14. Childs, J. Elon Musk says X is fighting bots and spam, and the solution is: $1 subscriptions — latimes.com. https://\\nwww.latimes.com/business/story/2023-10-18/x-pilot-program-to-charge-1-a-year-in-effort-to-combat-bots-spam (2023).\\n[Accessed 28-10-2024].\\n14/33'),\n", + " Document(metadata={'file_path': 'more_arxiv/2501.00855v1.What_is_a_Social_Media_Bot__A_Global_Comparison_of_Bot_and_Human_Characteristics.pdf', 'page_number': 15, 'total_pages': 33, 'entry_id': 'http://arxiv.org/abs/2501.00855v1', 'title': 'What is a Social Media Bot? A Global Comparison of Bot and Human Characteristics', 'authors': ['Lynnette Hui Xian Ng', 'Kathleen M. Carley'], 'published': '2025-01-01', 'updated': '2025-01-01', 'primary_category': 'cs.CY', 'categories': ['cs.CY', 'cs.AI', 'cs.SI'], 'pdf_url': 'http://arxiv.org/pdf/2501.00855v1'}, text='15. Ellaky, Z., Benabbou, F. & Ouahabi, S. 
Systematic literature review of social media bots detection systems. J. King Saud\\nUniv. Inf. Sci. 35, 101551 (2023).\\n16. Ng, L. H. X., Bartulovic, M. & Carley, K. M. Tiny-botbuster: Identifying automated political coordination in digital\\ncampaigns. In International Conference on Social Computing, Behavioral-Cultural Modeling and Prediction and\\nBehavior Representation in Modeling and Simulation, 25–34 (Springer, 2024).\\n17. Beskow, D. M. & Carley, K. M. Bot-hunter: a tiered approach to detecting & characterizing automated activity on twitter.\\nIn Conference paper. SBP-BRiMS: International conference on social computing, behavioral-cultural modeling and\\nprediction and behavior representation in modeling and simulation, vol. 3 (2018).\\n18. Sayyadiharikandeh, M., Varol, O., Yang, K.-C., Flammini, A. & Menczer, F. Detection of novel social bots by ensembles\\nof specialized classifiers. In Proceedings of the 29th ACM international conference on information & knowledge\\nmanagement, 2725–2732 (2020).\\n19. Ng, L. H. X. & Carley, K. M. Botbuster: Multi-platform bot detection using a mixture of experts. In Proceedings of the\\ninternational AAAI conference on web and social media, vol. 17, 686–697 (2023).\\n20. Orabi, M., Mouheb, D., Al Aghbari, Z. & Kamel, I. Detection of bots in social media: a systematic review. Inf. Process.\\n& Manag. 57, 102250 (2020).\\n21. Kolomeets, M., Chechulin, A. & Kotenko, I. V . Bot detection by friends graph in social networks.J. Wirel. Mob. Networks\\nUbiquitous Comput. Dependable Appl. 12, 141–159 (2021).\\n22. Li, S. et al. Botfinder: a novel framework for social bots detection in online social networks based on graph embedding\\nand community detection. World Wide Web26, 1793–1809 (2023).\\n23. Feng, S. et al. What does the bot say? opportunities and risks of large language models in social media bot detection.\\narXiv preprint arXiv:2402.00371 (2024).\\n24. Jacobs, C. S., Ng, L. H. X. & Carley, K. M. 
Tracking china’s cross-strait bot networks against taiwan. In International\\nconference on social computing, behavioral-cultural modeling and prediction and behavior representation in modeling\\nand simulation, 115–125 (Springer, 2023).\\n25. Uyheng, J. & Carley, K. M. Bot impacts on public sentiment and community structures: Comparative analysis of three\\nelections in the asia-pacific. In Social, Cultural, and Behavioral Modeling: 13th International Conference, SBP-BRiMS\\n2020, Washington, DC, USA, October 18–21, 2020, Proceedings 13, 12–22 (Springer, 2020).\\n26. Bessi, A. & Ferrara, E. Social bots distort the 2016 us presidential election online discussion. First monday 21 (2016).\\n27. Khaund, T., Al-Khateeb, S., Tokdemir, S. & Agarwal, N. Analyzing social bots and their coordination during natural\\ndisasters. In Social, Cultural, and Behavioral Modeling: 11th International Conference, SBP-BRiMS 2018, Washington,\\nDC, USA, July 10-13, 2018, Proceedings 11, 207–212 (Springer, 2018).\\n28. Ng, L. H. & Taeihagh, A. How does fake news spread? understanding pathways of disinformation spread through apis.\\nPolicy & Internet 13, 560–585 (2021).\\n29. Hajli, N., Saeed, U., Tajvidi, M. & Shirazi, F. Social bots and the spread of disinformation in social media: the challenges\\nof artificial intelligence. Br. J. Manag. 33, 1238–1253 (2022).\\n30. Kenny, R., Fischhoff, B., Davis, A., Carley, K. M. & Canfield, C. Duped by bots: why some are better than others at\\ndetecting fake social media personas. Hum. factors 66, 88–102 (2024).\\n31. Kolomeets, M., Tushkanova, O., Desnitsky, V ., Vitkova, L. & Chechulin, A. Experimental evaluation: Can humans\\nrecognise social media bots? Big Data Cogn. Comput. 8, 24 (2024).\\n32. Carley, K. M. Social cybersecurity: an emerging science. Comput. mathematical organization theory 26, 365–381 (2020).\\n33. Ferrara, E., Varol, O., Davis, C., Menczer, F. & Flammini, A. The rise of social bots. Commun. ACM 59, 96–104 (2016).\\n34. 
Uyheng, J., Ng, L. H. X. & Carley, K. M. Active, aggressive, but to little avail: characterizing bot activity during the 2020\\nsingaporean elections. Comput. Math. Organ. Theory 27, 324–342 (2021).\\n35. Himelein-Wachowiak, M. et al. Bots and misinformation spread on social media: Implications for covid-19. J. medical\\nInternet research 23, e26933 (2021).\\n36. Ng, L. H. X., Zhou, W. & Carley, K. M. Exploring cognitive bias triggers in covid-19 misinformation tweets: A bot vs.\\nhuman perspective. arXiv preprint arXiv:2406.07293 (2024).\\n37. Cloudflare. What is a social media bot? | social media bot definition. https://www.cloudflare.com/learning/bots/\\nwhat-is-a-social-media-bot/. [Accessed 28-10-2024].\\n15/33')]" ] }, - "execution_count": 12, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } diff --git a/src/floki/document/fetcher/arxiv.py b/src/floki/document/fetcher/arxiv.py index 5725574..adfd574 100644 --- a/src/floki/document/fetcher/arxiv.py +++ b/src/floki/document/fetcher/arxiv.py @@ -32,6 +32,7 @@ def search( to_date: Union[str, datetime, None] = None, download: bool = False, dirpath: Path = Path("./"), + include_summary: bool = False, **kwargs ) -> Union[List[Dict], List["Document"]]: """ @@ -43,6 +44,7 @@ def search( to_date (Union[str, datetime, None]): End date for the search in 'YYYYMMDD' format or as a datetime object. download (bool): Whether to download the papers as PDFs. dirpath (Path): Directory path for the downloads (used if download=True). + include_summary (bool): Whether to include the paper summary in the returned metadata or documents. Defaults to False. **kwargs: Additional search parameters (e.g., sort_by). 
Returns: @@ -85,21 +87,15 @@ def search( results = list(search.results()) logger.info(f"Found {len(results)} results for query: {query}") - if download: - metadata_list = [] - for result in results: - file_path = self._download_result(result, dirpath) - metadata_list.append(self._format_result_metadata(result, file_path=file_path)) - return metadata_list - else: - documents = [] - for result in results: - metadata = self._format_result_metadata(result) - text = result.summary.strip() - documents.append(Document(text=text, metadata=metadata)) - return documents + return self._process_results(results, download, dirpath, include_summary) - def search_by_id(self, content_id: str, download: bool = False, dirpath: Path = Path("./")) -> Union[Optional[Dict], Optional[Document]]: + def search_by_id( + self, + content_id: str, + download: bool = False, + dirpath: Path = Path("./"), + include_summary: bool = False + ) -> Union[Optional[Dict], Optional[Document]]: """ Search for a specific paper by its arXiv ID and optionally download it. @@ -107,10 +103,19 @@ def search_by_id(self, content_id: str, download: bool = False, dirpath: Path = content_id (str): The arXiv ID of the paper. download (bool): Whether to download the paper. dirpath (Path): Directory path for the download (used if download=True). + include_summary (bool): Whether to include the paper summary in the returned metadata or document. Defaults to False. Returns: Union[Optional[Dict], Optional[Document]]: Metadata dictionary if `download=True`, otherwise a `Document` object. + + Examples: + >>> fetcher = ArxivFetcher() + >>> fetcher.search_by_id("1234.5678") + # Searches for the paper with arXiv ID "1234.5678". + + >>> fetcher.search_by_id("1234.5678", download=True, dirpath=Path("./downloads")) + # Searches for the paper with arXiv ID "1234.5678" and downloads it to "./downloads". 
""" logger.info(f"Searching for paper by ID: {content_id}") try: @@ -120,17 +125,45 @@ def search_by_id(self, content_id: str, download: bool = False, dirpath: Path = logger.warning(f"No result found for ID: {content_id}") return None - if download: - file_path = self._download_result(result, dirpath) - return self._format_result_metadata(result, file_path=file_path) - else: - metadata = self._format_result_metadata(result) - text = result.summary.strip() - return Document(text=text, metadata=metadata) + return self._process_results([result], download, dirpath, include_summary)[0] except Exception as e: logger.error(f"Error fetching result for ID {content_id}: {e}") return None + def _process_results( + self, + results: List[arxiv.Result], + download: bool, + dirpath: Path, + include_summary: bool + ) -> Union[List[Dict], List["Document"]]: + """ + Process arXiv search results. + + Args: + results (List[arxiv.Result]): The list of arXiv result objects. + download (bool): Whether to download the papers as PDFs. + dirpath (Path): Directory path for the downloads (used if download=True). + include_summary (bool): Whether to include the paper summary in the returned metadata or documents. + + Returns: + Union[List[Dict], List[Document]]: A list of metadata dictionaries if `download=True`, + otherwise a list of `Document` objects. + """ + if download: + metadata_list = [] + for result in results: + file_path = self._download_result(result, dirpath) + metadata_list.append(self._format_result_metadata(result, file_path=file_path, include_summary=include_summary)) + return metadata_list + else: + documents = [] + for result in results: + metadata = self._format_result_metadata(result, include_summary=include_summary) + text = result.summary.strip() + documents.append(Document(text=text, metadata=metadata)) + return documents + def _download_result(self, result: arxiv.Result, dirpath: Path) -> Optional[str]: """ Download a paper from an arXiv result object. 
@@ -153,13 +186,14 @@ def _download_result(self, result: arxiv.Result, dirpath: Path) -> Optional[str] logger.error(f"Failed to download paper {result.title}: {e}") return None - def _format_result_metadata(self, result: arxiv.Result, file_path: Optional[str] = None) -> Dict: + def _format_result_metadata(self, result: arxiv.Result, file_path: Optional[str] = None, include_summary: bool = False) -> Dict: """ - Format metadata from an arXiv result, optionally including file path. + Format metadata from an arXiv result, optionally including file path and summary. Args: result (arxiv.Result): The arXiv result object. file_path (Optional[str]): Path to the downloaded file. + include_summary (bool): Whether to include the summary in the metadata. Returns: Dict: A dictionary containing formatted metadata. @@ -183,10 +217,11 @@ def _format_result_metadata(self, result: arxiv.Result, file_path: Optional[str] "DOI": result.doi, "journal_reference": result.journal_ref, }) - - filtered_metadata = {key: value for key, value in metadata.items() if value is not None} - return filtered_metadata + if include_summary: + metadata["summary"] = result.summary.strip() + + return {key: value for key, value in metadata.items() if value is not None} def _format_date(self, date: Union[str, datetime]) -> str: """