dissertation/references.bib
2022-03-14 22:27:55 +00:00

2476 lines
214 KiB
BibTeX
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

@misc{gailly_gzip_2020,
  author       = {Gailly, Jean-loup},
  title        = {Gzip},
  howpublished = {Gzip - GNU Project - Free Software Foundation},
  url          = {https://www.gnu.org/software/gzip/},
  urldate      = {2022-03-14},
  month        = aug,
  year         = {2020},
}
@misc{tea_hillion_nodate,
  author       = {{Gitea}},
  title        = {Hillion {Gitea}},
  howpublished = {Hillion Gitea},
  url          = {https://gitea.hillion.co.uk/},
  urldate      = {2022-03-13},
  abstract     = {Gitea (Git with a cup of tea) is a painless self-hosted Git service written in Go},
  language     = {en-US},
}
@inproceedings{padon_ivy_2016,
  author     = {Padon, Oded and McMillan, Kenneth L. and Panda, Aurojit and Sagiv, Mooly and Shoham, Sharon},
  title      = {Ivy: safety verification by interactive generalization},
  shorttitle = {Ivy},
  booktitle  = {Proceedings of the 37th {ACM} {SIGPLAN} {Conference} on {Programming} {Language} {Design} and {Implementation}},
  series     = {{PLDI} '16},
  address    = {New York, NY, USA},
  publisher  = {Association for Computing Machinery},
  isbn       = {978-1-4503-4261-2},
  doi        = {10.1145/2908080.2908118},
  url        = {https://doi.org/10.1145/2908080.2908118},
  urldate    = {2022-03-10},
  abstract   = {Despite several decades of research, the problem of formal verification of infinite-state systems has resisted effective automation. We describe a system --- Ivy --- for interactively verifying safety of infinite-state systems. Ivy's key principle is that whenever verification fails, Ivy graphically displays a concrete counterexample to induction. The user then interactively guides generalization from this counterexample. This process continues until an inductive invariant is found. Ivy searches for universally quantified invariants, and uses a restricted modeling language. This ensures that all verification conditions can be checked algorithmically. All user interactions are performed using graphical models, easing the user's task. We describe our initial experience with verifying several distributed protocols.},
  keywords   = {counterexamples to induction, distributed systems, invariant inference, safety verification},
  month      = jun,
  year       = {2016},
  pages      = {614--630},
}
@inproceedings{nelson_scaling_2019,
  author    = {Nelson, Luke and Bornholt, James and Gu, Ronghui and Baumann, Andrew and Torlak, Emina and Wang, Xi},
  title     = {Scaling symbolic evaluation for automated verification of systems code with {Serval}},
  booktitle = {Proceedings of the 27th {ACM} {Symposium} on {Operating} {Systems} {Principles}},
  series    = {{SOSP} '19},
  address   = {New York, NY, USA},
  publisher = {Association for Computing Machinery},
  isbn      = {978-1-4503-6873-5},
  doi       = {10.1145/3341301.3359641},
  url       = {https://doi.org/10.1145/3341301.3359641},
  urldate   = {2022-03-07},
  abstract  = {This paper presents Serval, a framework for developing automated verifiers for systems software. Serval provides an extensible infrastructure for creating verifiers by lifting interpreters under symbolic evaluation, and a systematic approach to identifying and repairing verification performance bottlenecks using symbolic profiling and optimizations. Using Serval, we build automated verifiers for the RISC-V, x86--32, LLVM, and BPF instruction sets. We report our experience of retrofitting CertiKOS and Komodo, two systems previously verified using Coq and Dafny, respectively, for automated verification using Serval, and discuss trade-offs of different verification methodologies. In addition, we apply Serval to the Keystone security monitor and the BPF compilers in the Linux kernel, and uncover 18 new bugs through verification, all confirmed and fixed by developers.},
  month     = oct,
  year      = {2019},
  pages     = {225--242},
}
@inproceedings{ma_i4_2019,
  author     = {Ma, Haojun and Goel, Aman and Jeannin, Jean-Baptiste and Kapritsos, Manos and Kasikci, Baris and Sakallah, Karem A.},
  title      = {I4: incremental inference of inductive invariants for verification of distributed protocols},
  shorttitle = {I4},
  booktitle  = {Proceedings of the 27th {ACM} {Symposium} on {Operating} {Systems} {Principles}},
  series     = {{SOSP} '19},
  address    = {New York, NY, USA},
  publisher  = {Association for Computing Machinery},
  isbn       = {978-1-4503-6873-5},
  doi        = {10.1145/3341301.3359651},
  url        = {https://doi.org/10.1145/3341301.3359651},
  urldate    = {2022-03-07},
  abstract   = {Designing and implementing distributed systems correctly is a very challenging task. Recently, formal verification has been successfully used to prove the correctness of distributed systems. At the heart of formal verification lies a computer-checked proof with an inductive invariant. Finding this inductive invariant, however, is the most difficult part of the proof. Alas, current proof techniques require inductive invariants to be found manually---and painstakingly---by the developer. In this paper, we present a new approach, Incremental Inference of Inductive Invariants (I4), to automatically generate inductive invariants for distributed protocols. The essence of our idea is simple: the inductive invariant of a finite instance of the protocol can be used to infer a general inductive invariant for the infinite distributed protocol. In I4, we create a finite instance of the protocol; use a model checking tool to automatically derive the inductive invariant for this finite instance; and generalize this invariant to an inductive invariant for the infinite protocol. Our experiments show that I4 can prove the correctness of several distributed protocols like Chord, 2PC and Transaction Chains with little to no human effort.},
  month      = oct,
  year       = {2019},
  pages      = {370--384},
}
@inproceedings{hawblitzel_ironfleet_2015,
  author     = {Hawblitzel, Chris and Howell, Jon and Kapritsos, Manos and Lorch, Jacob R. and Parno, Bryan and Roberts, Michael L. and Setty, Srinath and Zill, Brian},
  title      = {{IronFleet}: proving practical distributed systems correct},
  shorttitle = {{IronFleet}},
  booktitle  = {Proceedings of the 25th {Symposium} on {Operating} {Systems} {Principles}},
  series     = {{SOSP} '15},
  address    = {New York, NY, USA},
  publisher  = {Association for Computing Machinery},
  isbn       = {978-1-4503-3834-9},
  doi        = {10.1145/2815400.2815428},
  url        = {https://doi.org/10.1145/2815400.2815428},
  urldate    = {2022-03-07},
  abstract   = {Distributed systems are notorious for harboring subtle bugs. Verification can, in principle, eliminate these bugs a priori, but verification has historically been difficult to apply at full-program scale, much less distributed-system scale. We describe a methodology for building practical and provably correct distributed systems based on a unique blend of TLA-style state-machine refinement and Hoare-logic verification. We demonstrate the methodology on a complex implementation of a Paxos-based replicated state machine library and a lease-based sharded key-value store. We prove that each obeys a concise safety specification, as well as desirable liveness requirements. Each implementation achieves performance competitive with a reference system. With our methodology and lessons learned, we aim to raise the standard for distributed systems from "tested" to "correct."},
  month      = oct,
  year       = {2015},
  pages      = {1--17},
}
@inproceedings{song_practical_2000,
  author    = {Song, Dawn Xiaodong and Wagner, David and Perrig, Adrian},
  title     = {Practical techniques for searches on encrypted data},
  booktitle = {Proceeding 2000 {IEEE} {Symposium} on {Security} and {Privacy}. {S} {P} 2000},
  doi       = {10.1109/SECPRI.2000.848445},
  abstract  = {It is desirable to store data on data storage servers such as mail servers and file servers in encrypted form to reduce security and privacy risks. But this usually implies that one has to sacrifice functionality for security. For example, if a client wishes to retrieve only documents containing certain words, it was not previously known how to let the data storage server perform the search and answer the query, without loss of data confidentiality. We describe our cryptographic schemes for the problem of searching on encrypted data and provide proofs of security for the resulting crypto systems. Our techniques have a number of crucial advantages. They are provably secure: they provide provable secrecy for encryption, in the sense that the untrusted server cannot learn anything about the plaintext when only given the ciphertext; they provide query isolation for searches, meaning that the untrusted server cannot learn anything more about the plaintext than the search result; they provide controlled searching, so that the untrusted server cannot search for an arbitrary word without the user's authorization; they also support hidden queries, so that the user may ask the untrusted server to search for a secret word without revealing the word to the server. The algorithms presented are simple, fast (for a document of length n, the encryption and search algorithms only need O(n) stream cipher and block cipher operations), and introduce almost no space and communication overhead, and hence are practical to use today.},
  note      = {ISSN: 1081-6011},
  keywords  = {Authorization, Contracts, Cryptography, Data privacy, Data security, Electronic mail, File servers, Memory, Postal services, US Government agencies},
  month     = may,
  year      = {2000},
  pages     = {44--55},
}
@inproceedings{boldyreva_order-preserving_2009,
  author    = {Boldyreva, Alexandra and Chenette, Nathan and Lee, Younho and O'Neill, Adam},
  editor    = {Joux, Antoine},
  title     = {Order-{Preserving} {Symmetric} {Encryption}},
  booktitle = {Advances in {Cryptology} - {EUROCRYPT} 2009},
  series    = {Lecture {Notes} in {Computer} {Science}},
  address   = {Berlin, Heidelberg},
  publisher = {Springer},
  isbn      = {978-3-642-01001-9},
  doi       = {10.1007/978-3-642-01001-9_13},
  abstract  = {We initiate the cryptographic study of order-preserving symmetric encryption (OPE), a primitive suggested in the database community by Agrawal et al. (SIGMOD 04) for allowing efficient range queries on encrypted data. Interestingly, we first show that a straightforward relaxation of standard security notions for encryption such as indistinguishability against chosen-plaintext attack (IND-CPA) is unachievable by a practical OPE scheme. Instead, we propose a security notion in the spirit of pseudorandom functions (PRFs) and related primitives asking that an OPE scheme look “as-random-as-possible” subject to the order-preserving constraint. We then design an efficient OPE scheme and prove its security under our notion based on pseudorandomness of an underlying blockcipher. Our construction is based on a natural relation we uncover between a random order-preserving function and the hypergeometric probability distribution. In particular, it makes black-box use of an efficient sampling algorithm for the latter.},
  language  = {en},
  keywords  = {Encrypt Data, Encryption Algorithm, Range Query, Sampling Algorithm, Symmetric Encryption},
  year      = {2009},
  pages     = {224--241},
}
@article{bonawitz_towards_2019,
  author     = {Bonawitz, Keith and Eichner, Hubert and Grieskamp, Wolfgang and Huba, Dzmitry and Ingerman, Alex and Ivanov, Vladimir and Kiddon, Chloé and Konečný, Jakub and Mazzocchi, Stefano and McMahan, Brendan and Van Overveldt, Timon and Petrou, David and Ramage, Daniel and Roselander, Jason},
  title      = {Towards {Federated} {Learning} at {Scale}: {System} {Design}},
  shorttitle = {Towards {Federated} {Learning} at {Scale}},
  journal    = {Proceedings of Machine Learning and Systems},
  volume     = {1},
  url        = {https://proceedings.mlsys.org/paper/2019/hash/bd686fd640be98efaae0091fa301e613-Abstract.html},
  urldate    = {2022-02-28},
  language   = {en},
  month      = apr,
  year       = {2019},
  pages      = {374--388},
}
@article{hunt_ryoan_2018,
  author     = {Hunt, Tyler and Zhu, Zhiting and Xu, Yuanzhong and Peter, Simon and Witchel, Emmett},
  title      = {Ryoan: {A} {Distributed} {Sandbox} for {Untrusted} {Computation} on {Secret} {Data}},
  shorttitle = {Ryoan},
  journal    = {ACM Transactions on Computer Systems},
  volume     = {35},
  number     = {4},
  issn       = {0734-2071},
  doi        = {10.1145/3231594},
  url        = {https://doi.org/10.1145/3231594},
  urldate    = {2022-02-28},
  abstract   = {Users of modern data-processing services such as tax preparation or genomic screening are forced to trust them with data that the users wish to keep secret. Ryoan protects secret data while it is processed by services that the data owner does not trust. Accomplishing this goal in a distributed setting is difficult, because the user has no control over the service providers or the computational platform. Confining code to prevent it from leaking secrets is notoriously difficult, but Ryoan benefits from new hardware and a request-oriented data model. Ryoan provides a distributed sandbox, leveraging hardware enclaves (e.g., Intel's software guard extensions (SGX) [40]) to protect sandbox instances from potentially malicious computing platforms. The protected sandbox instances confine untrusted data-processing modules to prevent leakage of the user's input data. Ryoan is designed for a request-oriented data model, where confined modules only process input once and do not persist state about the input. We present the design and prototype implementation of Ryoan and evaluate it on a series of challenging problems including email filtering, health analysis, image processing and machine translation.},
  keywords   = {Intel SGX, enclaves, private computation, sandboxing, untrusted OS},
  month      = dec,
  year       = {2018},
  pages      = {13:1--13:32},
}
@inproceedings{popa_cryptdb_2011,
  author     = {Popa, Raluca Ada and Redfield, Catherine M. S. and Zeldovich, Nickolai and Balakrishnan, Hari},
  title      = {{CryptDB}: protecting confidentiality with encrypted query processing},
  shorttitle = {{CryptDB}},
  booktitle  = {Proceedings of the {Twenty}-{Third} {ACM} {Symposium} on {Operating} {Systems} {Principles}},
  series     = {{SOSP} '11},
  address    = {New York, NY, USA},
  publisher  = {Association for Computing Machinery},
  isbn       = {978-1-4503-0977-6},
  doi        = {10.1145/2043556.2043566},
  url        = {https://doi.org/10.1145/2043556.2043566},
  urldate    = {2022-02-28},
  abstract   = {Online applications are vulnerable to theft of sensitive information because adversaries can exploit software bugs to gain access to private data, and because curious or malicious administrators may capture and leak data. CryptDB is a system that provides practical and provable confidentiality in the face of these attacks for applications backed by SQL databases. It works by executing SQL queries over encrypted data using a collection of efficient SQL-aware encryption schemes. CryptDB can also chain encryption keys to user passwords, so that a data item can be decrypted only by using the password of one of the users with access to that data. As a result, a database administrator never gets access to decrypted data, and even if all servers are compromised, an adversary cannot decrypt the data of any user who is not logged in. An analysis of a trace of 126 million SQL queries from a production MySQL server shows that CryptDB can support operations over encrypted data for 99.5\% of the 128,840 columns seen in the trace. Our evaluation shows that CryptDB has low overhead, reducing throughput by 14.5\% for phpBB, a web forum application, and by 26\% for queries from TPC-C, compared to unmodified MySQL. Chaining encryption keys to user passwords requires 11--13 unique schema annotations to secure more than 20 sensitive fields and 2--7 lines of source code changes for three multi-user web applications.},
  month      = oct,
  year       = {2011},
  pages      = {85--100},
}
@inproceedings{dean_mapreduce_2004,
  author    = {Dean, Jeffrey and Ghemawat, Sanjay},
  title     = {{MapReduce}: {Simplified} data processing on large clusters},
  booktitle = {Proceedings of the 6th {Symposium} on {Operating} {Systems} {Design} and {Implementation} ({OSDI} '04)},
  publisher = {USENIX Association},
  month     = dec,
  year      = {2004},
}
@inproceedings{malewicz_pregel_2010,
  author     = {Malewicz, Grzegorz and Austern, Matthew H. and Bik, Aart J. C. and Dehnert, James C. and Horn, Ilan and Leiser, Naty and Czajkowski, Grzegorz},
  title      = {Pregel: a system for large-scale graph processing},
  shorttitle = {Pregel},
  booktitle  = {Proceedings of the 2010 {ACM} {SIGMOD} {International} {Conference} on {Management} of data},
  series     = {{SIGMOD} '10},
  address    = {New York, NY, USA},
  publisher  = {Association for Computing Machinery},
  isbn       = {978-1-4503-0032-2},
  doi        = {10.1145/1807167.1807184},
  url        = {https://doi.org/10.1145/1807167.1807184},
  urldate    = {2022-02-21},
  abstract   = {Many practical computing problems concern large graphs. Standard examples include the Web graph and various social networks. The scale of these graphs - in some cases billions of vertices, trillions of edges - poses challenges to their efficient processing. In this paper we present a computational model suitable for this task. Programs are expressed as a sequence of iterations, in each of which a vertex can receive messages sent in the previous iteration, send messages to other vertices, and modify its own state and that of its outgoing edges or mutate graph topology. This vertex-centric approach is flexible enough to express a broad set of algorithms. The model has been designed for efficient, scalable and fault-tolerant implementation on clusters of thousands of commodity computers, and its implied synchronicity makes reasoning about programs easier. Distribution-related details are hidden behind an abstract API. The result is a framework for processing large graphs that is expressive and easy to program.},
  keywords   = {distributed computing, graph algorithms},
  month      = jun,
  year       = {2010},
  pages      = {135--146},
}
@article{isard_dryad_2007,
  author     = {Isard, Michael and Budiu, Mihai and Yu, Yuan and Birrell, Andrew and Fetterly, Dennis},
  title      = {Dryad: distributed data-parallel programs from sequential building blocks},
  shorttitle = {Dryad},
  journal    = {ACM SIGOPS Operating Systems Review},
  volume     = {41},
  number     = {3},
  issn       = {0163-5980},
  doi        = {10.1145/1272998.1273005},
  url        = {https://doi.org/10.1145/1272998.1273005},
  urldate    = {2022-02-21},
  abstract   = {Dryad is a general-purpose distributed execution engine for coarse-grain data-parallel applications. A Dryad application combines computational "vertices" with communication "channels" to form a dataflow graph. Dryad runs the application by executing the vertices of this graph on a set of available computers, communicating as appropriate through files, TCP pipes, and shared-memory FIFOs. The vertices provided by the application developer are quite simple and are usually written as sequential programs with no thread creation or locking. Concurrency arises from Dryad scheduling vertices to run simultaneously on multiple computers, or on multiple CPU cores within a computer. The application can discover the size and placement of data at run time, and modify the graph as the computation progresses to make efficient use of the available resources. Dryad is designed to scale from powerful multi-core single computers, through small clusters of computers, to data centers with thousands of computers. The Dryad execution engine handles all the difficult problems of creating a large distributed, concurrent application: scheduling the use of computers and their CPUs, recovering from communication or computer failures, and transporting data between vertices.},
  keywords   = {cluster computing, concurrency, dataflow, distributed programming},
  month      = mar,
  year       = {2007},
  pages      = {59--72},
}
@misc{noauthor_osdi_nodate,
  title   = {{OSDI} '04 {Abstract}},
  url     = {https://www.usenix.org/legacy/events/osdi04/tech/dean.html},
  urldate = {2022-02-21},
}
@article{dean_mapreduce_2008,
  author     = {Dean, Jeffrey and Ghemawat, Sanjay},
  title      = {{MapReduce}: simplified data processing on large clusters},
  shorttitle = {{MapReduce}},
  journal    = {Communications of the ACM},
  volume     = {51},
  number     = {1},
  issn       = {0001-0782},
  doi        = {10.1145/1327452.1327492},
  url        = {https://doi.org/10.1145/1327452.1327492},
  urldate    = {2022-02-21},
  abstract   = {MapReduce is a programming model and an associated implementation for processing and generating large datasets that is amenable to a broad variety of real-world tasks. Users specify the computation in terms of a map and a reduce function, and the underlying runtime system automatically parallelizes the computation across large-scale clusters of machines, handles machine failures, and schedules inter-machine communication to make efficient use of the network and disks. Programmers find the system easy to use: more than ten thousand distinct MapReduce programs have been implemented internally at Google over the past four years, and an average of one hundred thousand MapReduce jobs are executed on Google's clusters every day, processing a total of more than twenty petabytes of data per day.},
  month      = jan,
  year       = {2008},
  pages      = {107--113},
}
@inproceedings{hindman_mesos_nodate,
  author    = {Hindman, Benjamin and Konwinski, Andy and Zaharia, Matei and Ghodsi, Ali and Joseph, Anthony D. and Katz, Randy and Shenker, Scott and Stoica, Ion},
  title     = {Mesos: {A} {Platform} for {Fine}-{Grained} {Resource} {Sharing} in the {Data} {Center}},
  booktitle = {8th {USENIX} {Symposium} on {Networked} {Systems} {Design} and {Implementation} ({NSDI} 11)},
  publisher = {USENIX Association},
  year      = {2011},
  abstract  = {We present Mesos, a platform for sharing commodity clusters between multiple diverse cluster computing frameworks, such as Hadoop and MPI. Sharing improves cluster utilization and avoids per-framework data replication. Mesos shares resources in a fine-grained manner, allowing frameworks to achieve data locality by taking turns reading data stored on each machine. To support the sophisticated schedulers of today's frameworks, Mesos introduces a distributed two-level scheduling mechanism called resource offers. Mesos decides how many resources to offer each framework, while frameworks decide which resources to accept and which computations to run on them. Our results show that Mesos can achieve near-optimal data locality when sharing the cluster among diverse frameworks, can scale to 50,000 (emulated) nodes, and is resilient to failures.},
  language  = {en},
}
@inproceedings{delimitrou_quasar_nodate,
  author    = {Delimitrou, Christina and Kozyrakis, Christos},
  title     = {Quasar: {Resource}-{Efficient} and {QoS}-{Aware} {Cluster} {Management}},
  booktitle = {Proceedings of the 19th {International} {Conference} on {Architectural} {Support} for {Programming} {Languages} and {Operating} {Systems} ({ASPLOS} '14)},
  publisher = {Association for Computing Machinery},
  year      = {2014},
  abstract  = {Cloud computing promises flexibility and high performance for users and high cost-efficiency for operators. Nevertheless, most cloud facilities operate at very low utilization, hurting both cost effectiveness and future scalability.},
  language  = {en},
}
@inproceedings{gog_firmament_2016,
  author     = {Gog, Ionel and Schwarzkopf, Malte and Gleave, Adam and Watson, Robert N. M. and Hand, Steven},
  title      = {Firmament: {Fast}, {Centralized} {Cluster} {Scheduling} at {Scale}},
  shorttitle = {Firmament},
  booktitle  = {12th {USENIX} {Symposium} on {Operating} {Systems} {Design} and {Implementation} ({OSDI} 16)},
  publisher  = {USENIX Association},
  isbn       = {978-1-931971-33-1},
  url        = {https://www.usenix.org/conference/osdi16/technical-sessions/presentation/gog},
  urldate    = {2022-02-14},
  language   = {en},
  month      = nov,
  year       = {2016},
  pages      = {99--115},
}
@article{zhang_fuxi_2014,
  author     = {Zhang, Z. and Li, C. and Tao, Y. and Yang, R. and Tang, H. and Xu, J.},
  title      = {Fuxi: {A} fault-tolerant resource management and job scheduling system at internet scale},
  shorttitle = {Fuxi},
  journal    = {Proceedings of the VLDB Endowment},
  volume     = {7},
  number     = {13},
  issn       = {2150-8097},
  publisher  = {VLDB Endowment Inc.},
  copyright  = {cc\_by\_nc\_nd\_4},
  url        = {http://www.vldb.org/pvldb/vol7/p1393-zhang.pdf},
  urldate    = {2022-02-14},
  abstract   = {Scalability and fault-tolerance are two fundamental challenges for all distributed computing at Internet scale. Despite many recent advances from both academia and industry, these two problems are still far from settled. In this paper, we present Fuxi, a resource management and job scheduling system that is capable of handling the kind of workload at Alibaba where hundreds of terabytes of data are generated and analyzed everyday to help optimize the company's business operations and user experiences. We employ several novel techniques to enable Fuxi to perform efficient scheduling of hundreds of thousands of concurrent tasks over large clusters with thousands of nodes: 1) an incremental resource management protocol that supports multi-dimensional resource allocation and data locality; 2) user-transparent failure recovery where failures of any Fuxi components will not impact the execution of user jobs; and 3) an effective detection mechanism and a multi-level blacklisting scheme that prevents them from affecting job execution. Our evaluation results demonstrate that 95\% and 91\% scheduled CPU/memory utilization can be fulfilled under synthetic workloads, and Fuxi is capable of achieving 2.36TB/minute throughput in GraySort. Additionally, the same Fuxi job only experiences approximately 16\% slowdown under a 5\% fault-injection rate. The slowdown only grows to 20\% when we double the fault-injection rate to 10\%. Fuxi has been deployed in our production environment since 2009, and it now manages hundreds of thousands of server nodes.},
  language   = {en},
  month      = aug,
  year       = {2014},
  pages      = {1393--1404},
  note       = {Presented at the 40th International Conference on Very Large Data Bases, Hangzhou, China},
}
@inproceedings{verma_large-scale_2015,
  author    = {Verma, Abhishek and Pedrosa, Luis and Korupolu, Madhukar and Oppenheimer, David and Tune, Eric and Wilkes, John},
  title     = {Large-scale cluster management at {Google} with {Borg}},
  booktitle = {Proceedings of the {Tenth} {European} {Conference} on {Computer} {Systems}},
  series    = {{EuroSys} '15},
  address   = {New York, NY, USA},
  publisher = {Association for Computing Machinery},
  isbn      = {978-1-4503-3238-5},
  doi       = {10.1145/2741948.2741964},
  url       = {https://doi.org/10.1145/2741948.2741964},
  urldate   = {2022-02-14},
  abstract  = {Google's Borg system is a cluster manager that runs hundreds of thousands of jobs, from many thousands of different applications, across a number of clusters each with up to tens of thousands of machines. It achieves high utilization by combining admission control, efficient task-packing, over-commitment, and machine sharing with process-level performance isolation. It supports high-availability applications with runtime features that minimize fault-recovery time, and scheduling policies that reduce the probability of correlated failures. Borg simplifies life for its users by offering a declarative job specification language, name service integration, real-time job monitoring, and tools to analyze and simulate system behavior. We present a summary of the Borg system architecture and features, important design decisions, a quantitative analysis of some of its policy decisions, and a qualitative examination of lessons learned from a decade of operational experience with it.},
  month     = apr,
  year      = {2015},
  pages     = {1--17},
}
@article{sewell_x86-tso_2010,
  author     = {Sewell, Peter and Sarkar, Susmit and Owens, Scott and Zappa Nardelli, Francesco and Myreen, Magnus O.},
  title      = {x86-{TSO}: a rigorous and usable programmer's model for x86 multiprocessors},
  shorttitle = {x86-{TSO}},
  journal    = {Communications of the ACM},
  volume     = {53},
  number     = {7},
  issn       = {0001-0782},
  doi        = {10.1145/1785414.1785443},
  url        = {https://doi.org/10.1145/1785414.1785443},
  urldate    = {2022-02-10},
  abstract   = {Exploiting the multiprocessors that have recently become ubiquitous requires high-performance and reliable concurrent systems code, for concurrent data structures, operating system kernels, synchronization libraries, compilers, and so on. However, concurrent programming, which is always challenging, is made much more so by two problems. First, real multiprocessors typically do not provide the sequentially consistent memory that is assumed by most work on semantics and verification. Instead, they have relaxed memory models, varying in subtle ways between processor families, in which different hardware threads may have only loosely consistent views of a shared memory. Second, the public vendor architectures, supposedly specifying what programmers can rely on, are often in ambiguous informal prose (a particularly poor medium for loose specifications), leading to widespread confusion. In this paper we focus on x86 processors. We review several recent Intel and AMD specifications, showing that all contain serious ambiguities, some are arguably too weak to program above, and some are simply unsound with respect to actual hardware. We present a new x86-TSO programmer's model that, to the best of our knowledge, suffers from none of these problems. It is mathematically precise (rigorously defined in HOL4) but can be presented as an intuitive abstract machine which should be widely accessible to working programmers. We illustrate how this can be used to reason about the correctness of a Linux spinlock implementation and describe a general theory of data-race freedom for x86-TSO. This should put x86 multiprocessor system building on a more solid foundation; it should also provide a basis for future work on verification of such systems.},
  month      = jul,
  year       = {2010},
  pages      = {89--97},
}
@inproceedings{hunt_zookeeper_2010,
  author    = {Hunt, Patrick and Konar, Mahadev and Junqueira, Flavio P. and Reed, Benjamin},
  title     = {{ZooKeeper}: {Wait}-free coordination for {Internet}-scale systems},
  booktitle = {2010 {USENIX} {Annual} {Technical} {Conference} ({USENIX} {ATC} 10)},
  publisher = {USENIX Association},
  abstract  = {In this paper, we describe ZooKeeper, a service for coordinating processes of distributed applications. Since ZooKeeper is part of critical infrastructure, ZooKeeper aims to provide a simple and high performance kernel for building more complex coordination primitives at the client. It incorporates elements from group messaging, shared registers, and distributed lock services in a replicated, centralized service. The interface exposed by ZooKeeper has the wait-free aspects of shared registers with an event-driven mechanism similar to cache invalidations of distributed file systems to provide a simple, yet powerful coordination service.},
  language  = {en},
  year      = {2010},
}
@article{van_renesse_paxos_2015,
  author   = {van Renesse, Robbert and Altinbuken, Deniz},
  title    = {Paxos {Made} {Moderately} {Complex}},
  journal  = {ACM Computing Surveys},
  volume   = {47},
  number   = {3},
  issn     = {0360-0300},
  doi      = {10.1145/2673577},
  url      = {https://doi.org/10.1145/2673577},
  urldate  = {2022-02-07},
  abstract = {This article explains the full reconfigurable multidecree Paxos (or multi-Paxos) protocol. Paxos is by no means a simple protocol, even though it is based on relatively simple invariants. We provide pseudocode and explain it guided by invariants. We initially avoid optimizations that complicate comprehension. Next we discuss liveness, list various optimizations that make the protocol practical, and present variants of the protocol.},
  keywords = {Replicated state machines, consensus, voting},
  month    = feb,
  year     = {2015},
  pages    = {42:1--42:36},
}
@inproceedings{burrows_chubby_2006,
address = {USA},
series = {{OSDI} '06},
title = {The {Chubby} lock service for loosely-coupled distributed systems},
isbn = {978-1-931971-47-8},
abstract = {We describe our experiences with the Chubby lock service, which is intended to provide coarse-grained locking as well as reliable (though low-volume) storage for a loosely-coupled distributed system. Chubby provides an interface much like a distributed file system with advisory locks, but the design emphasis is on availability and reliability, as opposed to high performance. Many instances of the service have been used for over a year, with several of them each handling a few tens of thousands of clients concurrently. The paper describes the initial design and expected use, compares it with actual use, and explains how the design had to be modified to accommodate the differences.},
urldate = {2022-02-07},
booktitle = {Proceedings of the 7th symposium on {Operating} systems design and implementation},
publisher = {USENIX Association},
author = {Burrows, Mike},
month = nov,
year = {2006},
pages = {335--350},
}
@inproceedings{ongaro_search_2014,
title = {In {Search} of an {Understandable} {Consensus} {Algorithm}},
isbn = {978-1-931971-10-2},
url = {https://www.usenix.org/conference/atc14/technical-sessions/presentation/ongaro},
language = {en},
urldate = {2022-02-07},
booktitle = {Proceedings of the 2014 {USENIX} {Annual} {Technical} {Conference}},
publisher = {USENIX Association},
author = {Ongaro, Diego and Ousterhout, John},
month = jun,
year = {2014},
pages = {305--319},
}
@misc{the_docker_team_its_2014,
title = {It's {Here}: {Docker} 1.0 {\textbar} {Docker} {Blog}},
shorttitle = {It's {Here}},
url = {https://blog.docker.com/2014/06/its-here-docker-1-0/},
urldate = {2014-06-11},
author = {{The Docker Team}},
month = jun,
year = {2014},
}
@misc{kerrisk_mount_namespaces7_2021,
title = {mount\_namespaces(7)},
url = {https://man7.org/linux/man-pages/man7/mount_namespaces.7.html},
urldate = {2022-02-03},
journal = {Linux manual pages},
author = {Kerrisk, Michael and Biederman, Eric W.},
month = aug,
year = {2021},
}
@article{yasunori_kernel-based_2011,
title = {Kernel-based {Virtual} {Machine} {Technology}},
volume = {47},
language = {en},
number = {3},
journal = {FUJITSU Sci. Tech. J.},
author = {Goto, Yasunori},
year = {2011},
pages = {7},
}
@misc{hamilton_xen--nitro_nodate,
type = {Blog},
title = {Xen-on-{Nitro}: {AWS} {Nitro} for {Legacy} {Instances} -- {Perspectives}},
shorttitle = {Xen-on-{Nitro}},
url = {https://perspectives.mvdirona.com/2021/11/xen-on-nitro-aws-nitro-for-legacy-instances/},
language = {en-US},
urldate = {2022-02-03},
journal = {Perspectives},
author = {Hamilton, James},
month = nov,
year = {2021},
}
@article{duda_borrowed-virtual-time_1999,
title = {Borrowed-virtual-time ({BVT}) scheduling: supporting latency-sensitive threads in a general-purpose scheduler},
volume = {33},
issn = {0163-5980},
shorttitle = {Borrowed-virtual-time ({BVT}) scheduling},
url = {https://doi.org/10.1145/319344.319169},
doi = {10.1145/319344.319169},
abstract = {Systems need to run a larger and more diverse set of applications, from real-time to interactive to batch, on uniprocessor and multiprocessor platforms. However, most schedulers either do not address latency requirements or are specialized to complex real-time paradigms, limiting their applicability to general-purpose systems.In this paper, we present Borrowed-Virtual-Time (BVT) Scheduling, showing that it provides low-latency for real-time and interactive applications yet weighted sharing of the CPU across applications according to system policy, even with thread failure at the real-time level, all with a low-overhead implementation on multiprocessors as well as uniprocessors. It makes minimal demands on application developers, and can be used with a reservation or admission control module for hard real-time applications.},
number = {5},
urldate = {2022-02-03},
journal = {ACM SIGOPS Operating Systems Review},
author = {Duda, Kenneth J. and Cheriton, David R.},
month = dec,
year = {1999},
pages = {261--276},
}
@techreport{vmware_inc_understanding_2008,
title = {Understanding {Full} {Virtualization}, {Paravirtualization} and {Hardware} {Assist}},
url = {https://www.vmware.com/content/dam/digitalmarketing/vmware/en/pdf/techpaper/VMware_paravirtualization.pdf},
abstract = {In 1998, VMware figured out how to virtualize the x86 platform, once thought to be impossible, and created the market for x86 virtualization. The solution was a combination of binary translation and direct execution on the processor that allowed multiple guest OSes to run in full isolation on the same computer with readily affordable virtualization overhead. The savings that tens of thousands of companies have generated from the deployment of this technology is further driving the rapid adoption of virtualized computing from the desktop to the data center. As new vendors enter the space and attempt to differentiate their products, many are creating confusion with their marketing claims and terminology. For example, while hardware assist is a valuable technique that will mature and expand the envelope of workloads that can be virtualized, paravirtualization is not an entirely new technology that offers an order of magnitude greater performance. While this is a complex and rapidly evolving space, the technologies employed can be readily explained to help companies understand their options and choose a path forward. This white paper attempts to clarify the various techniques used to virtualize x86 hardware, the strengths and weaknesses of each, and VMware's community approach to develop and employ the most effective of the emerging virtualization techniques. Figure 1 provides a summary timeline of x86 virtualization technologies from VMware's binary translation to the recent application of kernel paravirtualization and hardware-assisted virtualization.},
urldate = {2022-02-02},
institution = {VMware, Inc.},
author = {{VMware, Inc.}},
month = mar,
year = {2008},
}
@inproceedings{litton_light-weight_2016,
title = {{Light-Weight} {Contexts}: {An} {OS} {Abstraction} for {Safety} and {Performance}},
isbn = {978-1-931971-33-1},
shorttitle = {{Light-Weight} {Contexts}},
url = {https://www.usenix.org/conference/osdi16/technical-sessions/presentation/litton},
language = {en},
urldate = {2022-01-31},
booktitle = {Proceedings of the 12th {USENIX} {Symposium} on {Operating} {Systems} {Design} and {Implementation} ({OSDI} 16)},
publisher = {USENIX Association},
author = {Litton, James and Vahldiek-Oberwagner, Anjo and Elnikety, Eslam and Garg, Deepak and Bhattacharjee, Bobby and Druschel, Peter},
year = {2016},
pages = {49--64},
}
@inproceedings{manco_my_2017,
address = {New York, NY, USA},
series = {{SOSP} '17},
title = {My {VM} is {Lighter} (and {Safer}) than your {Container}},
isbn = {978-1-4503-5085-3},
url = {https://doi.org/10.1145/3132747.3132763},
doi = {10.1145/3132747.3132763},
abstract = {Containers are in great demand because they are lightweight when compared to virtual machines. On the downside, containers offer weaker isolation than VMs, to the point where people run containers in virtual machines to achieve proper isolation. In this paper, we examine whether there is indeed a strict tradeoff between isolation (VMs) and efficiency (containers). We find that VMs can be as nimble as containers, as long as they are small and the toolstack is fast enough. We achieve lightweight VMs by using unikernels for specialized applications and with Tinyx, a tool that enables creating tailor-made, trimmed-down Linux virtual machines. By themselves, lightweight virtual machines are not enough to ensure good performance since the virtualization control plane (the toolstack) becomes the performance bottleneck. We present LightVM, a new virtualization solution based on Xen that is optimized to offer fast boot-times regardless of the number of active VMs. LightVM features a complete redesign of Xen's control plane, transforming its centralized operation to a distributed one where interactions with the hypervisor are reduced to a minimum. LightVM can boot a VM in 2.3ms, comparable to fork/exec on Linux (1ms), and two orders of magnitude faster than Docker. LightVM can pack thousands of LightVM guests on modest hardware with memory and CPU usage comparable to that of processes.},
urldate = {2022-01-31},
booktitle = {Proceedings of the 26th {Symposium} on {Operating} {Systems} {Principles}},
publisher = {Association for Computing Machinery},
author = {Manco, Filipe and Lupu, Costin and Schmidt, Florian and Mendes, Jose and Kuenzer, Simon and Sati, Sumit and Yasukata, Kenichi and Raiciu, Costin and Huici, Felipe},
month = oct,
year = {2017},
keywords = {Virtualization, Xen, containers, hypervisor, operating systems, specialization, unikernels, virtual machine},
pages = {218--233},
}
@inproceedings{soltesz_container-based_2007,
address = {New York, NY, USA},
series = {{EuroSys} '07},
title = {Container-based operating system virtualization: a scalable, high-performance alternative to hypervisors},
isbn = {978-1-59593-636-3},
shorttitle = {Container-based operating system virtualization},
url = {https://doi.org/10.1145/1272996.1273025},
doi = {10.1145/1272996.1273025},
abstract = {Hypervisors, popularized by Xen and VMware, are quickly becoming commodity. They are appropriate for many usage scenarios, but there are scenarios that require system virtualization with high degrees of both isolation and efficiency. Examples include HPC clusters, the Grid, hosting centers, and PlanetLab. We present an alternative to hypervisors that is better suited to such scenarios. The approach is a synthesis of prior work on resource containers and security containers applied to general-purpose, time-shared operating systems. Examples of such container-based systems include Solaris 10, Virtuozzo for Linux, and Linux-VServer. As a representative instance of container-based systems, this paper describes the design and implementation of Linux-VServer. In addition, it contrasts the architecture of Linux-VServer with current generations of Xen, and shows how Linux-VServer provides comparable support for isolation and superior system efficiency.},
urldate = {2022-01-31},
booktitle = {Proceedings of the 2nd {ACM} {SIGOPS}/{EuroSys} {European} {Conference} on {Computer} {Systems} 2007},
publisher = {Association for Computing Machinery},
author = {Soltesz, Stephen and Pötzl, Herbert and Fiuczynski, Marc E. and Bavier, Andy and Peterson, Larry},
month = mar,
year = {2007},
keywords = {Linux-VServer, Xen, alternative, container, hypervisor, operating, system, virtualization},
pages = {275--287},
}
@article{barham_xen_2003,
title = {Xen and the art of virtualization},
volume = {37},
issn = {0163-5980},
url = {https://doi.org/10.1145/1165389.945462},
doi = {10.1145/1165389.945462},
abstract = {Numerous systems have been designed which use virtualization to subdivide the ample resources of a modern computer. Some require specialized hardware, or cannot support commodity operating systems. Some target 100\% binary compatibility at the expense of performance. Others sacrifice security or functionality for speed. Few offer resource isolation or performance guarantees; most provide only best-effort provisioning, risking denial of service.This paper presents Xen, an x86 virtual machine monitor which allows multiple commodity operating systems to share conventional hardware in a safe and resource managed fashion, but without sacrificing either performance or functionality. This is achieved by providing an idealized virtual machine abstraction to which operating systems such as Linux, BSD and Windows XP, can be ported with minimal effort.Our design is targeted at hosting up to 100 virtual machine instances simultaneously on a modern server. The virtualization approach taken by Xen is extremely efficient: we allow operating systems such as Linux and Windows XP to be hosted simultaneously for a negligible performance overhead --- at most a few percent compared with the unvirtualized case. We considerably outperform competing commercial and freely available solutions in a range of microbenchmarks and system-wide tests.},
number = {5},
urldate = {2022-01-31},
journal = {ACM SIGOPS Operating Systems Review},
author = {Barham, Paul and Dragovic, Boris and Fraser, Keir and Hand, Steven and Harris, Tim and Ho, Alex and Neugebauer, Rolf and Pratt, Ian and Warfield, Andrew},
month = oct,
year = {2003},
keywords = {hypervisors, paravirtualization, virtual machine monitors},
pages = {164--177},
}
@article{ritchie_unix_1974,
title = {The {UNIX} time-sharing system},
volume = {17},
issn = {0001-0782},
url = {https://doi.org/10.1145/361011.361061},
doi = {10.1145/361011.361061},
abstract = {UNIX is a general-purpose, multi-user, interactive operating system for the Digital Equipment Corporation PDP-11/40 and 11/45 computers. It offers a number of features seldom found even in larger operating systems, including: (1) a hierarchical file system incorporating demountable volumes; (2) compatible file, device, and inter-process I/O; (3) the ability to initiate asynchronous processes; (4) system command language selectable on a per-user basis; and (5) over 100 subsystems including a dozen languages. This paper discusses the nature and implementation of the file system and of the user command interface.},
number = {7},
urldate = {2022-01-24},
journal = {Communications of the ACM},
author = {Ritchie, Dennis M. and Thompson, Ken},
month = jul,
year = {1974},
keywords = {PDP-11, command language, file system, operating system, time-sharing},
pages = {365--375},
}
@article{madhavapeddy_unikernels_2013,
title = {Unikernels: library operating systems for the cloud},
volume = {41},
issn = {0163-5964},
shorttitle = {Unikernels},
url = {https://doi.org/10.1145/2490301.2451167},
doi = {10.1145/2490301.2451167},
abstract = {We present unikernels, a new approach to deploying cloud services via applications written in high-level source code. Unikernels are single-purpose appliances that are compile-time specialised into standalone kernels, and sealed against modification when deployed to a cloud platform. In return they offer significant reduction in image sizes, improved efficiency and security, and should reduce operational costs. Our Mirage prototype compiles OCaml code into unikernels that run on commodity clouds and offer an order of magnitude reduction in code size without significant performance penalty. The architecture combines static type-safety with a single address-space layout that can be made immutable via a hypervisor extension. Mirage contributes a suite of type-safe protocol libraries, and our results demonstrate that the hypervisor is a platform that overcomes the hardware compatibility issues that have made past library operating systems impractical to deploy in the real-world.},
number = {1},
urldate = {2022-01-24},
journal = {ACM SIGARCH Computer Architecture News},
author = {Madhavapeddy, Anil and Mortier, Richard and Rotsos, Charalampos and Scott, David and Singh, Balraj and Gazagnaire, Thomas and Smith, Steven and Hand, Steven and Crowcroft, Jon},
month = mar,
year = {2013},
keywords = {functional programming, hypervisor, microkernel},
pages = {461--472},
}
@inproceedings{baumann_multikernel_2009,
address = {New York, NY, USA},
series = {{SOSP} '09},
title = {The multikernel: a new {OS} architecture for scalable multicore systems},
isbn = {978-1-60558-752-3},
shorttitle = {The multikernel},
url = {https://doi.org/10.1145/1629575.1629579},
doi = {10.1145/1629575.1629579},
abstract = {Commodity computer systems contain more and more processor cores and exhibit increasingly diverse architectural tradeoffs, including memory hierarchies, interconnects, instruction sets and variants, and IO configurations. Previous high-performance computing systems have scaled in specific cases, but the dynamic nature of modern client and server workloads, coupled with the impossibility of statically optimizing an OS for all workloads and hardware variants pose serious challenges for operating system structures. We argue that the challenge of future multicore hardware is best met by embracing the networked nature of the machine, rethinking OS architecture using ideas from distributed systems. We investigate a new OS structure, the multikernel, that treats the machine as a network of independent cores, assumes no inter-core sharing at the lowest level, and moves traditional OS functionality to a distributed system of processes that communicate via message-passing. We have implemented a multikernel OS to show that the approach is promising, and we describe how traditional scalability problems for operating systems (such as memory management) can be effectively recast using messages and can exploit insights from distributed systems and networking. An evaluation of our prototype on multicore systems shows that, even on present-day machines, the performance of a multikernel is comparable with a conventional OS, and can scale better to support future hardware.},
urldate = {2022-01-24},
booktitle = {Proceedings of the {ACM} {SIGOPS} 22nd symposium on {Operating} systems principles},
publisher = {Association for Computing Machinery},
author = {Baumann, Andrew and Barham, Paul and Dagand, Pierre-Evariste and Harris, Tim and Isaacs, Rebecca and Peter, Simon and Roscoe, Timothy and Schüpbach, Adrian and Singhania, Akhilesh},
month = oct,
year = {2009},
keywords = {message passing, multicore processors, scalability},
pages = {29--44},
}
@article{engler_exokernel_1995,
title = {Exokernel: an operating system architecture for application-level resource management},
volume = {29},
issn = {0163-5980},
shorttitle = {Exokernel},
url = {https://doi.org/10.1145/224057.224076},
doi = {10.1145/224057.224076},
number = {5},
urldate = {2022-01-24},
journal = {ACM SIGOPS Operating Systems Review},
author = {Engler, D. R. and Kaashoek, M. F. and O'Toole, J.},
month = dec,
year = {1995},
pages = {251--266},
}
@inproceedings{lampson_hints_1983,
address = {New York, NY, USA},
series = {{SOSP} '83},
title = {Hints for computer system design},
isbn = {978-0-89791-115-3},
url = {https://doi.org/10.1145/800217.806614},
doi = {10.1145/800217.806614},
abstract = {Experience with the design and implementation of a number of computer systems, and study of many other systems, has led to some general hints for system design which are described here. They are illustrated by a number of examples, ranging from hardware such as the Alto and the Dorado to applications programs such as Bravo and Star.},
urldate = {2022-01-20},
booktitle = {Proceedings of the ninth {ACM} symposium on {Operating} systems principles},
publisher = {Association for Computing Machinery},
author = {Lampson, Butler W.},
month = oct,
year = {1983},
pages = {33--48},
}
@inproceedings{neumann_role_1969,
address = {New York, NY, USA},
series = {{SOSP} '69},
title = {The role of motherhood in the pop art of system programming},
isbn = {978-1-4503-7456-9},
url = {https://doi.org/10.1145/961053.961060},
doi = {10.1145/961053.961060},
abstract = {Numerous papers and conference talks have recently been devoted to the affirmation or reaffirmation of various common-sense principles of computer program design and implementation, particularly with respect to operating systems and to large subsystems such as language translators. These principles are nevertheless little observed in practice, often to the detriment of the resulting systems. This paper attempts to summarize the most significant principles, to evaluate their applicability in the real world of large multi-access systems, and to assess how they can be used more effectively.},
urldate = {2022-01-20},
booktitle = {Proceedings of the second symposium on {Operating} systems principles},
publisher = {Association for Computing Machinery},
author = {Neumann, Peter G.},
month = oct,
year = {1969},
pages = {13--18},
}
@inproceedings{needham_theory_1969,
address = {New York, NY, USA},
series = {{SOSP} '69},
title = {Theory and practice in operating system design},
isbn = {978-1-4503-7456-9},
url = {https://doi.org/10.1145/961053.961058},
doi = {10.1145/961053.961058},
abstract = {In designing an operating system one needs both theoretical insight and horse sense. Without the former, one designs an ad hoc mess; without the latter one designs an elephant in best Carrara marble (white, perfect, and immobile). We try in this paper to explore the provinces of the two needs, suggesting places where we think horse sense will always be needed, and some other places where better theoretical understanding than we now have would seem both possible and helpful.},
urldate = {2022-01-20},
booktitle = {Proceedings of the second symposium on {Operating} systems principles},
publisher = {Association for Computing Machinery},
author = {Needham, R. M. and Hartley, D. F.},
month = oct,
year = {1969},
pages = {8--12},
}
@article{ghosh_right_2020,
title = {Right buffer sizing matters: {Some} dynamical and statistical studies on {Compound} {TCP}},
volume = {139},
issn = {0166-5316},
shorttitle = {Right buffer sizing matters},
url = {https://www.sciencedirect.com/science/article/pii/S0166531620300158},
doi = {10.1016/j.peva.2020.102095},
abstract = {Large and unmanaged router buffers could lead to an increase in queuing delays in the Internet, which is a serious concern for network performance and quality of service. Our focus is to conduct a performance evaluation of Compound TCP (C-TCP), in a regime where the router buffer sizes are small (i.e., independent of the bandwidth-delay product), and the queue policy is Drop-Tail. In particular, we provide buffer sizing recommendations for high speed core routers fed by well multiplexed TCP controlled flows. For this, we consider two topologies: a single bottleneck and a multi-bottleneck topology, under different traffic scenarios. The first topology consists of a single bottleneck router, and the second consists of two distinct sets of TCP flows, regulated by two edge routers, feeding into a common core router. We focus on some key dynamical and statistical properties of the underlying system. From a dynamical perspective, we first develop fluid models. A local stability analysis for these models yields a key insight: buffer sizes need to be dimensioned carefully, and smaller buffers favour stability. We also highlight that larger Drop-Tail buffers, in addition to increasing latency, are prone to inducing limit cycles in the system dynamics. These limit cycles in turn induce synchronisation among the TCP flows, which then results in a loss of link utilisation. We then empirically analyse some statistical properties of the bottleneck queues. These statistical analyses serve to validate an important modelling assumption: that in the regime considered, each bottleneck queue may be reasonably well approximated as either an MM1B or an MD1B queue. We also highlight that smaller buffers, in addition to ensuring stability and low latency, would also yield reasonable system-wide performance, in terms of throughput and flow completion times.},
language = {en},
urldate = {2022-01-17},
journal = {Performance Evaluation},
author = {Ghosh, Debayani and Jagannathan, Krishna and Raina, Gaurav},
month = jun,
year = {2020},
keywords = {Buffer sizing, Compound TCP, Drop-tail, Hopf bifurcation, Stability},
pages = {102095},
}
@inproceedings{chen_understanding_2009,
address = {New York, NY, USA},
series = {{WREN} '09},
title = {Understanding {TCP} incast throughput collapse in datacenter networks},
isbn = {978-1-60558-443-0},
url = {https://doi.org/10.1145/1592681.1592693},
doi = {10.1145/1592681.1592693},
abstract = {TCP Throughput Collapse, also known as Incast, is a pathological behavior of TCP that results in gross under-utilization of link capacity in certain many-to-one communication patterns. This phenomenon has been observed by others in distributed storage, MapReduce and web-search workloads. In this paper we focus on understanding the dynamics of Incast. We use empirical data to reason about the dynamic system of simultaneously communicating TCP entities. We propose an analytical model to account for the observed Incast symptoms, identify contributory factors, and explore the efficacy of solutions proposed by us and by others.},
urldate = {2022-01-16},
booktitle = {Proceedings of the 1st {ACM} workshop on {Research} on enterprise networking},
publisher = {Association for Computing Machinery},
author = {Chen, Yanpei and Griffith, Rean and Liu, Junda and Katz, Randy H. and Joseph, Anthony D.},
month = aug,
year = {2009},
keywords = {TCP, incast, throughput collapse, unix},
pages = {73--82},
}
@inproceedings{key_user_1999,
title = {User {Policies} in a {Network} {Implementing} {Congestion} {Pricing}},
author = {Key, Peter B. and Massoulié, Laurent},
year = {1999},
}
@article{noauthor_user_nodate,
title = {User {Policies} in a {Network} {Implementing} {Congestion} {Pricing} {\textbar} {Semantic} {Scholar}},
url = {https://www.semanticscholar.org/paper/User-Policies-in-a-Network-Implementing-Congestion-Key-Massouli%C3%A9/859cfebc43d18583a4f64564113a9fdd315cfaca},
abstract = {Semantic Scholar extracted view of \"User Policies in a Network Implementing Congestion Pricing\" by P. Key et al.},
language = {en},
urldate = {2022-01-16},
internal-note = {Likely duplicate of key_user_1999 (Semantic Scholar landing page for the same paper); consider consolidating citations and removing this entry},
}
@article{crowcroft_differentiated_1998,
title = {Differentiated end-to-end {Internet} services using a weighted proportional fair sharing {TCP}},
volume = {28},
issn = {0146-4833},
url = {https://doi.org/10.1145/293927.293930},
doi = {10.1145/293927.293930},
abstract = {In this document we study the application of weighted proportional fairness to data flows in the Internet. We let the users set the weights of their connections in order to maximise the utility they get from the network. When combined with a pricing scheme where connections are billed by weight and time, such a system is known to maximise the total utility of the network. Our study case is a national Web cache server connected to long distance links. We propose two ways of weighting TCP connections by manipulating some parameters of the protocol and present results from simulations and prototypes. We finally discuss how proportional fairness could be used to implement an Internet with differentiated services.},
number = {3},
urldate = {2022-01-16},
journal = {ACM SIGCOMM Computer Communication Review},
author = {Crowcroft, Jon and Oechslin, Philippe},
month = jul,
year = {1998},
pages = {53--69},
}
@inproceedings{tsirantonakis_large-scale_2018,
address = {San Diego, CA},
title = {A {Large}-scale {Analysis} of {Content} {Modification} by {Open} {HTTP} {Proxies}},
isbn = {978-1-891562-49-5},
url = {https://www.ndss-symposium.org/wp-content/uploads/2018/02/ndss2018_04A-1_Tsirantonakis_paper.pdf},
doi = {10.14722/ndss.2018.23244},
abstract = {Open HTTP proxies offer a quick and convenient solution for routing web traffic towards a destination. In contrast to more elaborate relaying systems, such as anonymity networks or VPN services, users can freely connect to an open HTTP proxy without the need to install any special software. Therefore, open HTTP proxies are an attractive option for bypassing IPbased filters and geo-location restrictions, circumventing content blocking and censorship, and in general, hiding the clients IP address when accessing a web server. Nevertheless, the consequences of routing traffic through an untrusted third party can be severe, while the operating incentives of the thousands of publicly available HTTP proxies are questionable.},
language = {en},
urldate = {2021-10-18},
booktitle = {Proceedings 2018 {Network} and {Distributed} {System} {Security} {Symposium}},
publisher = {Internet Society},
author = {Tsirantonakis, Giorgos and Ilia, Panagiotis and Ioannidis, Sotiris and Athanasopoulos, Elias and Polychronakis, Michalis},
year = {2018},
}
@article{kelly_models_2000,
title = {Models for a self-managed {Internet}},
volume = {358},
url = {https://royalsocietypublishing.org/doi/abs/10.1098/rsta.2000.0651},
doi = {10.1098/rsta.2000.0651},
abstract = {This paper uses a variety of mathematical models to explore some of the consequences of rapidly growing communications capacity for the evolution of the Internet. It argues that queueing delays may become small in comparison with propagation delays, and that differentiation between traffic classes within the network may become redundant. Instead, a simple packet network may be able to support an arbitrarily differentiated and constantly evolving set of services, by conveying information on incipient congestion to intelligent endnodes, which themselves determine what should be their demands on the packet network.},
number = {1773},
urldate = {2022-01-07},
journal = {Philosophical Transactions of the Royal Society of London. Series A: Mathematical, Physical and Engineering Sciences},
author = {Kelly, Frank P.},
month = aug,
year = {2000},
note = {Publisher: Royal Society},
keywords = {explicit congestion notification, fairness, queues, shadow price, short transfers, stability},
pages = {2335--2348},
}
@inproceedings{handley_re-architecting_2017,
address = {New York, NY, USA},
series = {{SIGCOMM} '17},
title = {Re-architecting datacenter networks and stacks for low latency and high performance},
isbn = {978-1-4503-4653-5},
url = {https://doi.org/10.1145/3098822.3098825},
doi = {10.1145/3098822.3098825},
abstract = {Modern datacenter networks provide very high capacity via redundant Clos topologies and low switch latency, but transport protocols rarely deliver matching performance. We present NDP, a novel data-center transport architecture that achieves near-optimal completion times for short transfers and high flow throughput in a wide range of scenarios, including incast. NDP switch buffers are very shallow and when they fill the switches trim packets to headers and priority forward the headers. This gives receivers a full view of instantaneous demand from all senders, and is the basis for our novel, high-performance, multipath-aware transport protocol that can deal gracefully with massive incast events and prioritize traffic from different senders on RTT timescales. We implemented NDP in Linux hosts with DPDK, in a software switch, in a NetFPGA-based hardware switch, and in P4. We evaluate NDP's performance in our implementations and in large-scale simulations, simultaneously demonstrating support for very low-latency and high throughput.},
urldate = {2022-01-07},
booktitle = {Proceedings of the {Conference} of the {ACM} {Special} {Interest} {Group} on {Data} {Communication}},
publisher = {Association for Computing Machinery},
author = {Handley, Mark and Raiciu, Costin and Agache, Alexandru and Voinescu, Andrei and Moore, Andrew W. and Antichi, Gianni and Wójcik, Marcin},
month = aug,
year = {2017},
keywords = {Datacenters, Network Stacks, Transport Protocols},
pages = {29--42},
}
@misc{noauthor_network_nodate,
title = {Network namespaces to the {Internet} with veth and {NAT}},
url = {https://josephmuia.ca/2018-05-16-net-namespaces-veth-nat/},
urldate = {2021-12-18},
author = {Muia, Joseph},
month = may,
year = {2018},
}
@misc{swetland_swetlandmkbox_2021,
title = {swetland/mkbox},
url = {https://github.com/swetland/mkbox},
abstract = {experiments in linux sandbox stuff},
urldate = {2021-12-13},
author = {Swetland, Brian},
month = nov,
year = {2021},
note = {original-date: 2014-04-27T04:46:18Z},
}
@misc{noauthor_control_nodate,
title = {Control {Group} {APIs} and {Delegation}},
url = {http://systemd.io/CGROUP_DELEGATION/},
urldate = {2021-12-13},
}
@misc{heo_control_2015,
title = {Control {Group} v2 — {The} {Linux} {Kernel} documentation},
url = {https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html},
urldate = {2021-12-13},
author = {Heo, Tejun},
month = oct,
year = {2015},
}
@misc{otwell_laravelframework_2013,
title = {laravel/framework},
copyright = {MIT},
url = {https://github.com/laravel/framework/blob/de69bb287c5017d1acb7d47a6db1dedf578036d6/src/Illuminate/Database/Migrations/Migration.php},
abstract = {The Laravel Framework.},
urldate = {2021-12-03},
publisher = {The Laravel Framework},
author = {Otwell, Taylor},
month = jan,
year = {2013},
note = {original-date: 2013-01-10T21:27:28Z},
}
% Conference paper: Klimovic et al., "Flash storage disaggregation", EuroSys 2016 (ACM).
% Entry has both doi and url (the url is just the DOI resolver); styles will prefer the doi.
@inproceedings{klimovic_flash_2016,
address = {New York, NY, USA},
series = {{EuroSys} '16},
title = {Flash storage disaggregation},
isbn = {978-1-4503-4240-7},
url = {https://doi.org/10.1145/2901318.2901337},
doi = {10.1145/2901318.2901337},
abstract = {PCIe-based Flash is commonly deployed to provide datacenter applications with high IO rates. However, its capacity and bandwidth are often underutilized as it is difficult to design servers with the right balance of CPU, memory and Flash resources over time and for multiple applications. This work examines Flash disaggregation as a way to deal with Flash overprovisioning. We tune remote access to Flash over commodity networks and analyze its impact on workloads sampled from real datacenter applications. We show that, while remote Flash access introduces a 20\% throughput drop at the application level, disaggregation allows us to make up for these overheads through resource-efficient scale-out. Hence, we show that Flash disaggregation allows scaling CPU and Flash resources independently in a cost effective manner. We use our analysis to draw conclusions about data and control plane issues in remote storage.},
urldate = {2021-12-03},
booktitle = {Proceedings of the {Eleventh} {European} {Conference} on {Computer} {Systems}},
publisher = {Association for Computing Machinery},
author = {Klimovic, Ana and Kozyrakis, Christos and Thereska, Eno and John, Binu and Kumar, Sanjeev},
month = apr,
year = {2016},
keywords = {datacenter, flash, network storage},
pages = {1--15},
}
% Web page: Tor Browser manual "About" page.
% Fix: corporate author is double-braced so BibTeX treats "The Tor Project, Inc."
% as a single literal name; with single braces the comma form parses it as
% Last = "The Tor Project", First = "Inc.", producing a mangled rendering.
@misc{the_tor_project_inc_about_nodate,
title = {{ABOUT} {TOR} {BROWSER}},
url = {https://tb-manual.torproject.org/about/},
urldate = {2021-10-30},
author = {{The Tor Project, Inc.}},
}
% Journal article: the RMT programmable-switch paper, ACM SIGCOMM CCR 43(4), 2013.
% doi is stored bare (correct); url duplicates the DOI resolver link.
@article{bosshart_forwarding_2013,
title = {Forwarding metamorphosis: fast programmable match-action processing in hardware for {SDN}},
volume = {43},
issn = {0146-4833},
shorttitle = {Forwarding metamorphosis},
url = {https://doi.org/10.1145/2534169.2486011},
doi = {10.1145/2534169.2486011},
abstract = {In Software Defined Networking (SDN) the control plane is physically separate from the forwarding plane. Control software programs the forwarding plane (e.g., switches and routers) using an open interface, such as OpenFlow. This paper aims to overcomes two limitations in current switching chips and the OpenFlow protocol: i) current hardware switches are quite rigid, allowing ``Match-Action'' processing on only a fixed set of fields, and ii) the OpenFlow specification only defines a limited repertoire of packet processing actions. We propose the RMT (reconfigurable match tables) model, a new RISC-inspired pipelined architecture for switching chips, and we identify the essential minimal set of action primitives to specify how headers are processed in hardware. RMT allows the forwarding plane to be changed in the field without modifying hardware. As in OpenFlow, the programmer can specify multiple match tables of arbitrary width and depth, subject only to an overall resource limit, with each table configurable for matching on arbitrary fields. However, RMT allows the programmer to modify all header fields much more comprehensively than in OpenFlow. Our paper describes the design of a 64 port by 10 Gb/s switch chip implementing the RMT model. Our concrete design demonstrates, contrary to concerns within the community, that flexible OpenFlow hardware switch implementations are feasible at almost no additional cost or power.},
number = {4},
urldate = {2021-12-02},
journal = {ACM SIGCOMM Computer Communication Review},
author = {Bosshart, Pat and Gibb, Glen and Kim, Hun-Seok and Varghese, George and McKeown, Nick and Izzard, Martin and Mujica, Fernando and Horowitz, Mark},
month = aug,
year = {2013},
keywords = {reconfigurable match tables, rmt model, sdn},
pages = {99--110},
}
% Conference paper: Intel Skylake-SP Xeon processor, ISSCC 2018.
% Fix: "xeon" is a proper noun (cf. "Xeon" in the abstract) — capitalised and
% braced in the title so sentence-casing styles cannot downcase it.
@inproceedings{tam_skylake-sp_2018,
title = {{SkyLake}-{SP}: {A} 14nm 28-{Core} {Xeon}® processor},
shorttitle = {{SkyLake}-{SP}},
doi = {10.1109/ISSCC.2018.8310170},
abstract = {SkyLake-SP (Scalable Performance), code name SKX, is the next generation Xeon® server processor fabricated on the Intel® 14nm tri-gate CMOS technology with 11-metal layers [1,2]. The SKX processor family has three core-count configurations. Each SKX core is accompanied by 1MB of dedicated L2 (2nd level cache) and 1.375MB of non-exclusive L3 (3rd level cache). At its maximum configuration of 28 cores, the SKX processor supports 6 DDR4 channels (2666MT/s), 3×20-lanes UPI processor-to-processor links (10.4GT/s) and x48+4 PCIE links (8GT/s). SKX supports per-core power-performance optimization enabled by on-die integrated voltage regulators (FIVR) [3, 4]. A new 2-dimensional synchronous on-die MESH fabric interconnects all the on-die components. Fig. 2.1.1 shows the overall architecture of the SKX processor.},
booktitle = {2018 {IEEE} {International} {Solid} - {State} {Circuits} {Conference} - ({ISSCC})},
author = {Tam, Simon M. and Muljono, Harry and Huang, Min and Iyer, Sitaraman and Royneogi, Kalapi and Satti, Nagmohan and Qureshi, Rizwan and Chen, Wei and Wang, Tom and Hsieh, Hubert and Vora, Sujal and Wang, Eddie},
month = feb,
year = {2018},
note = {ISSN: 2376-8606},
keywords = {Clocks, Computer architecture, Fabrics, Integrated circuit interconnections, Program processors, Servers, Two dimensional displays},
pages = {34--36},
}
% Journal article: Eyeriss v2, IEEE JETCAS 9(2), 2019.
% NOTE(review): "energyefficient" near the end of the abstract looks like a lost
% hyphen from the PDF scrape ("energy-efficient") — confirm before changing.
@article{chen_eyeriss_2019,
title = {Eyeriss v2: {A} {Flexible} {Accelerator} for {Emerging} {Deep} {Neural} {Networks} on {Mobile} {Devices}},
volume = {9},
issn = {2156-3365},
shorttitle = {Eyeriss v2},
doi = {10.1109/JETCAS.2019.2910232},
abstract = {A recent trend in deep neural network (DNN) development is to extend the reach of deep learning applications to platforms that are more resource and energy-constrained, e.g., mobile devices. These endeavors aim to reduce the DNN model size and improve the hardware processing efficiency and have resulted in DNNs that are much more compact in their structures and/or have high data sparsity. These compact or sparse models are different from the traditional large ones in that there is much more variation in their layer shapes and sizes and often require specialized hardware to exploit sparsity for performance improvement. Therefore, many DNN accelerators designed for large DNNs do not perform well on these models. In this paper, we present Eyeriss v2, a DNN accelerator architecture designed for running compact and sparse DNNs. To deal with the widely varying layer shapes and sizes, it introduces a highly flexible on-chip network, called hierarchical mesh, that can adapt to the different amounts of data reuse and bandwidth requirements of different data types, which improves the utilization of the computation resources. Furthermore, Eyeriss v2 can process sparse data directly in the compressed domain for both weights and activations and therefore is able to improve both processing speed and energy efficiency with sparse models. Overall, with sparse MobileNet, Eyeriss v2 in a 65-nm CMOS process achieves a throughput of 1470.6 inferences/s and 2560.3 inferences/J at a batch size of 1, which is 12.6× faster and 2.5× more energyefficient than the original Eyeriss running MobileNet.},
number = {2},
journal = {IEEE Journal on Emerging and Selected Topics in Circuits and Systems},
author = {Chen, Yu-Hsin and Yang, Tien-Ju and Emer, Joel and Sze, Vivienne},
month = jun,
year = {2019},
note = {Conference Name: IEEE Journal on Emerging and Selected Topics in Circuits and Systems},
keywords = {Arrays, Bandwidth, Deep neural network accelerators, Hardware, Mobile handsets, Parallel processing, Shape, dataflow processing, deep learning, energy-efficient accelerators, spatial architecture},
pages = {292--308},
}
% Journal article: the original Eyeriss row-stationary-dataflow paper,
% ACM SIGARCH Computer Architecture News 44(3), 2016.
@article{chen_eyeriss_2016,
title = {Eyeriss: a spatial architecture for energy-efficient dataflow for convolutional neural networks},
volume = {44},
issn = {0163-5964},
shorttitle = {Eyeriss},
url = {https://doi.org/10.1145/3007787.3001177},
doi = {10.1145/3007787.3001177},
abstract = {Deep convolutional neural networks (CNNs) are widely used in modern AI systems for their superior accuracy but at the cost of high computational complexity. The complexity comes from the need to simultaneously process hundreds of filters and channels in the high-dimensional convolutions, which involve a significant amount of data movement. Although highly-parallel compute paradigms, such as SIMD/SIMT, effectively address the computation requirement to achieve high throughput, energy consumption still remains high as data movement can be more expensive than computation. Accordingly, finding a dataflow that supports parallel processing with minimal data movement cost is crucial to achieving energy-efficient CNN processing without compromising accuracy. In this paper, we present a novel dataflow, called row-stationary (RS), that minimizes data movement energy consumption on a spatial architecture. This is realized by exploiting local data reuse of filter weights and feature map pixels, i.e., activations, in the high-dimensional convolutions, and minimizing data movement of partial sum accumulations. Unlike dataflows used in existing designs, which only reduce certain types of data movement, the proposed RS dataflow can adapt to different CNN shape configurations and reduces all types of data movement through maximally utilizing the processing engine (PE) local storage, direct inter-PE communication and spatial parallelism. To evaluate the energy efficiency of the different dataflows, we propose an analysis framework that compares energy cost under the same hardware area and processing parallelism constraints. Experiments using the CNN configurations of AlexNet show that the proposed RS dataflow is more energy efficient than existing dataflows in both convolutional (1.4× to 2.5×) and fully-connected layers (at least 1.3× for batch size larger than 16). The RS dataflow has also been demonstrated on a fabricated chip, which verifies our energy analysis.},
number = {3},
urldate = {2021-11-24},
journal = {ACM SIGARCH Computer Architecture News},
author = {Chen, Yu-Hsin and Emer, Joel and Sze, Vivienne},
month = jun,
year = {2016},
pages = {367--379},
}
% Conference paper: Morpheus SDN-controller upgrades, SOSR 2016 (ACM).
% The keyword "r02-essay-2" is a personal reading-list tag, not a paper keyword.
@inproceedings{saur_safe_2016,
address = {New York, NY, USA},
series = {{SOSR} '16},
title = {Safe and {Flexible} {Controller} {Upgrades} for {SDNs}},
isbn = {978-1-4503-4211-7},
url = {https://doi.org/10.1145/2890955.2890966},
doi = {10.1145/2890955.2890966},
abstract = {SDN controllers must be periodically upgraded to add features, improve performance, and fix bugs, but current techniques for implementing dynamic updates---i.e., without disrupting ongoing network functions---are inadequate. Simply halting the old controller and bringing up the new one can cause state to be lost, leading to incorrect behavior. For example, if the state represents flows blacklisted by a firewall, then traffic that should be blocked may be allowed to pass through. Techniques based on record and replay can reconstruct controller state automatically, but they are expensive to deploy and do not work in all scenarios. This paper presents a new approach to implementing dynamic updates for SDN controllers. We present the design and implementation of a new controller platform called Morpheus that uses explicit state transfer to implement dynamic updates. Morpheus enables programmers to directly initialize the upgraded controller's state as a function of its existing state, using a domain-specific language that is designed to be easy to use. Morpheus also offers a distributed protocol for safely deploying updates across multiple nodes. Experiments confirm that Morpheus provides correct behavior and good performance.},
urldate = {2021-11-20},
booktitle = {Proceedings of the {Symposium} on {SDN} {Research}},
publisher = {Association for Computing Machinery},
author = {Saur, Karla and Collard, Joseph and Foster, Nate and Guha, Arjun and Vanbever, Laurent and Hicks, Michael},
month = mar,
year = {2016},
keywords = {Dynamic Software Updating, Software-Defined Network, r02-essay-2},
pages = {1--12},
}
% Conference paper: dRMT disaggregated programmable switching, SIGCOMM 2017 (ACM).
% NOTE(review): the keyword "disagreggation" looks like a typo for "disaggregation"
% — confirm against the ACM DL record before changing, since keywords were scraped.
@inproceedings{chole_drmt_2017,
address = {New York, NY, USA},
series = {{SIGCOMM} '17},
title = {{dRMT}: {Disaggregated} {Programmable} {Switching}},
isbn = {978-1-4503-4653-5},
shorttitle = {{dRMT}},
url = {https://doi.org/10.1145/3098822.3098823},
doi = {10.1145/3098822.3098823},
abstract = {We present dRMT (disaggregated Reconfigurable Match-Action Table), a new architecture for programmable switches. dRMT overcomes two important restrictions of RMT, the predominant pipeline-based architecture for programmable switches: (1) table memory is local to an RMT pipeline stage, implying that memory not used by one stage cannot be reclaimed by another, and (2) RMT is hardwired to always sequentially execute matches followed by actions as packets traverse pipeline stages. We show that these restrictions make it difficult to execute programs efficiently on RMT. dRMT resolves both issues by disaggregating the memory and compute resources of a programmable switch. Specifically, dRMT moves table memories out of pipeline stages and into a centralized pool that is accessible through a crossbar. In addition, dRMT replaces RMT's pipeline stages with a cluster of processors that can execute match and action operations in any order. We show how to schedule a P4 program on dRMT at compile time to guarantee deterministic throughput and latency. We also present a hardware design for dRMT and analyze its feasibility and chip area. Our results show that dRMT can run programs at line rate with fewer processors compared to RMT, and avoids performance cliffs when there are not enough processors to run a program at line rate. dRMT's hardware design incurs a modest increase in chip area relative to RMT, mainly due to the crossbar.},
urldate = {2021-11-20},
booktitle = {Proceedings of the {Conference} of the {ACM} {Special} {Interest} {Group} on {Data} {Communication}},
publisher = {Association for Computing Machinery},
author = {Chole, Sharad and Fingerhut, Andy and Ma, Sha and Sivaraman, Anirudh and Vargaftik, Shay and Berger, Alon and Mendelson, Gal and Alizadeh, Mohammad and Chuang, Shang-Tse and Keslassy, Isaac and Orda, Ariel and Edsall, Tom},
month = aug,
year = {2017},
keywords = {Programmable switching, RMT, disagreggation, packet processing, r02-essay-2},
pages = {1--14},
}
% Conference paper: Composable Modular Design (CMD) for OOO processors, MICRO 2018.
% Fix: the last author is the mononym "Arvind"; the scraped form "Arvind, Arvind"
% would render as "Arvind Arvind". Braced as a single literal name, matching how
% the bourgeat_mi6_2019 entry in this file lists the same author.
@inproceedings{zhang_composable_2018,
title = {Composable {Building} {Blocks} to {Open} up {Processor} {Design}},
doi = {10.1109/MICRO.2018.00015},
abstract = {We present a framework called Composable Modular Design (CMD) to facilitate the design of out-of-order (OOO) processors. In CMD, (1) The interface methods of modules provide instantaneous access and perform atomic updates to the state elements inside the module; (2) Every interface method is guarded, i.e., it cannot be applied unless it is ready; and (3) Modules are composed together by atomic rules which call interface methods of different modules. A rule either successfully updates the state of all the called modules or it does nothing. CMD designs are compiled into RTL which can be run on FPGAs or synthesized using standard ASIC design flows. The atomicity properties of interfaces in CMD ensures composability when selected modules are refined selectively. We show the efficacy of CMD by building a parameterized out-of-order RISC-V processor which boots Linux and runs on FPGAs at 25 MHz to 40 MHz. We also synthesized several variants of it in a 32 nm technology to run at 1 GHz to 1.1 GHz. Performance evaluation shows that our processor beats in-order processors in terms of IPC but will require more architectural work to compete with wider superscalar commercial ARM processors. Modules designed under the CMD framework (e.g., ROB, reservation stations, load store unit) can be used and refined by other implementations. We believe that this realistic framework can revolutionize architectural research and practice as the library of reusable components grows.},
booktitle = {2018 51st {Annual} {IEEE}/{ACM} {International} {Symposium} on {Microarchitecture} ({MICRO})},
author = {Zhang, Sizhuo and Wright, Andrew and Bourgeat, Thomas and {Arvind}},
month = oct,
year = {2018},
keywords = {Bluespec, Hardware, Microarchitecture, Multicore processing, Out of order, RISC V, Rockets, Timing, composability, guarded atomic actions, modularity, out of order processor, processor design},
pages = {68--81},
}
% Conference paper: Spectre attacks, IEEE Symposium on Security and Privacy 2019.
% Keywords mixing IEEE index terms with personal tags (r265-week-7).
@inproceedings{kocher_spectre_2019,
title = {Spectre {Attacks}: {Exploiting} {Speculative} {Execution}},
shorttitle = {Spectre {Attacks}},
doi = {10.1109/SP.2019.00002},
abstract = {Modern processors use branch prediction and speculative execution to maximize performance. For example, if the destination of a branch depends on a memory value that is in the process of being read, CPUs will try to guess the destination and attempt to execute ahead. When the memory value finally arrives, the CPU either discards or commits the speculative computation. Speculative logic is unfaithful in how it executes, can access the victim's memory and registers, and can perform operations with measurable side effects. Spectre attacks involve inducing a victim to speculatively perform operations that would not occur during correct program execution and which leak the victim's confidential information via a side channel to the adversary. This paper describes practical attacks that combine methodology from side channel attacks, fault attacks, and return-oriented programming that can read arbitrary memory from the victim's process. More broadly, the paper shows that speculative execution implementations violate the security assumptions underpinning numerous software security mechanisms, including operating system process separation, containerization, just-in-time (JIT) compilation, and countermeasures to cache timing and side-channel attacks. These attacks represent a serious threat to actual systems since vulnerable speculative execution capabilities are found in microprocessors from Intel, AMD, and ARM that are used in billions of devices. While makeshift processor-specific countermeasures are possible in some cases, sound solutions will require fixes to processor designs as well as updates to instruction set architectures (ISAs) to give hardware architects and software developers a common understanding as to what computation state CPU implementations are (and are not) permitted to leak.},
booktitle = {2019 {IEEE} {Symposium} on {Security} and {Privacy} ({SP})},
author = {Kocher, Paul and Horn, Jann and Fogh, Anders and Genkin, Daniel and Gruss, Daniel and Haas, Werner and Hamburg, Mike and Lipp, Moritz and Mangard, Stefan and Prescher, Thomas and Schwarz, Michael and Yarom, Yuval},
month = may,
year = {2019},
note = {ISSN: 2375-1207},
keywords = {Arrays, Hardware, Microarchitecture, Program processors, Registers, Side-channel attacks, Spectre, Transient analysis, microarchitectural-attack, microarchitecture-security, r265-week-7, speculative-execution},
pages = {1--19},
}
% Conference paper: MI6 secure enclaves, MICRO-52 2019 (ACM).
% The mononym author "Arvind" appears unbraced in the name list; BibTeX parses a
% single token as a bare surname, so this renders correctly as-is.
@inproceedings{bourgeat_mi6_2019,
address = {New York, NY, USA},
series = {{MICRO} '52},
title = {{MI6}: {Secure} {Enclaves} in a {Speculative} {Out}-of-{Order} {Processor}},
isbn = {978-1-4503-6938-1},
shorttitle = {{MI6}},
url = {https://doi.org/10.1145/3352460.3358310},
doi = {10.1145/3352460.3358310},
abstract = {Recent attacks have broken process isolation by exploiting microarchitectural side channels that allow indirect access to shared microarchitectural state. Enclaves strengthen the process abstraction to restore isolation guarantees. We propose MI6, an aggressively speculative out-of-order processor capable of providing secure enclaves under a threat model that includes an untrusted OS and an attacker capable of mounting any software attack currently considered practical, including those utilizing control flow mis-speculation. MI6 is inspired by Sanctum [16] and extends its isolation guarantee to more realistic memory hierarchy. It also introduces a purge instruction, which is used only when a secure process is (de)scheduled, and implements it for a complex processor microarchitecture. We model the performance impact of enclaves in MI6 through FPGA emulation on AWS F1 FPGAs by running SPEC CINT2006 benchmarks as enclaves within an untrusted Linux OS. Security comes at the cost of approximately 16.4\% average slowdown for protected programs.},
urldate = {2021-11-17},
booktitle = {Proceedings of the 52nd {Annual} {IEEE}/{ACM} {International} {Symposium} on {Microarchitecture}},
publisher = {Association for Computing Machinery},
author = {Bourgeat, Thomas and Lebedev, Ilia and Wright, Andrew and Zhang, Sizhuo and Arvind and Devadas, Srinivas},
month = oct,
year = {2019},
keywords = {architectural isolation, microarchitectural isolation, r265-week-7, secure processors},
pages = {42--56},
}
% Conference paper: Capsicum capabilities, USENIX Security 2010.
% Fixes: glued initials "Robert NM" -> "Robert N. M." (space between initials so
% BibTeX abbreviates correctly), and dropped the stray trailing period in the
% title (the style supplies end punctuation).
% NOTE(review): volume = {46} and pages = {2} look like Google Scholar scrape
% artifacts — verify against the USENIX proceedings before relying on them.
@inproceedings{watson_capsicum_2010,
title = {Capsicum: {Practical} {Capabilities} for {UNIX}},
volume = {46},
booktitle = {{USENIX} {Security} {Symposium}},
author = {Watson, Robert N. M. and Anderson, Jonathan and Laurie, Ben and Kennaway, Kris},
year = {2010},
pages = {2},
}
% Conference paper: effect handlers for concurrency in Multicore OCaml,
% Trends in Functional Programming 2018 (Springer LNCS); has editor field.
@inproceedings{dolan_concurrent_2018,
address = {Cham},
series = {Lecture {Notes} in {Computer} {Science}},
title = {Concurrent {System} {Programming} with {Effect} {Handlers}},
isbn = {978-3-319-89719-6},
doi = {10.1007/978-3-319-89719-6_6},
abstract = {Algebraic effects and their handlers have been steadily gaining attention as a programming language feature for composably expressing user-defined computational effects. While several prototype implementations of languages incorporating algebraic effects exist, Multicore OCaml incorporates effect handlers as the primary means of expressing concurrency in the language. In this paper, we make the observation that effect handlers can elegantly express particularly difficult programs that combine system programming and concurrency without compromising performance. Our experimental results on a highly concurrent and scalable web server demonstrate that effect handlers perform on par with highly optimised monadic concurrency libraries, while retaining the simplicity of direct-style code.},
language = {en},
booktitle = {Trends in {Functional} {Programming}},
publisher = {Springer International Publishing},
author = {Dolan, Stephen and Eliopoulos, Spiros and Hillerström, Daniel and Madhavapeddy, Anil and Sivaramakrishnan, K. C. and White, Leo},
editor = {Wang, Meng and Owens, Scott},
year = {2018},
keywords = {Algebraic Effects, Default Handler, Event Handler, Programming Concurrent Systems, User-level Threads},
pages = {98--117},
}
% Conference paper: effect handlers in OCaml, PLDI 2021 (ACM).
% Fix: first author's glued initials "KC" -> "K. C." (spaced so BibTeX treats
% them as two initials), matching the spelling used for the same author in the
% dolan_concurrent_2018 entry in this file.
@inproceedings{sivaramakrishnan_retrofitting_2021,
address = {New York, NY, USA},
series = {{PLDI} 2021},
title = {Retrofitting effect handlers onto {OCaml}},
isbn = {978-1-4503-8391-2},
url = {https://doi.org/10.1145/3453483.3454039},
doi = {10.1145/3453483.3454039},
abstract = {Effect handlers have been gathering momentum as a mechanism for modular programming with user-defined effects. Effect handlers allow for non-local control flow mechanisms such as generators, async/await, lightweight threads and coroutines to be composably expressed. We present a design and evaluate a full-fledged efficient implementation of effect handlers for OCaml, an industrial-strength multi-paradigm programming language. Our implementation strives to maintain the backwards compatibility and performance profile of existing OCaml code. Retrofitting effect handlers onto OCaml is challenging since OCaml does not currently have any non-local control flow mechanisms other than exceptions. Our implementation of effect handlers for OCaml: (i) imposes a mean 1\% overhead on a comprehensive macro benchmark suite that does not use effect handlers; (ii) remains compatible with program analysis tools that inspect the stack; and (iii) is efficient for new code that makes use of effect handlers.},
urldate = {2021-11-16},
booktitle = {Proceedings of the 42nd {ACM} {SIGPLAN} {International} {Conference} on {Programming} {Language} {Design} and {Implementation}},
publisher = {Association for Computing Machinery},
author = {Sivaramakrishnan, K. C. and Dolan, Stephen and White, Leo and Kelly, Tom and Jaffer, Sadiq and Madhavapeddy, Anil},
month = jun,
year = {2021},
keywords = {Backtraces, Backwards compatibility, Continuations, Effect handlers, Fibers},
pages = {206--221},
}
% Blog post: Cloudflare HTTP/3 announcement, September 2019.
% Fix: restored the apostrophe lost in the scraped abstract ("Were excited" ->
% "We're excited"); the file header warns of mis-encoded/invisible characters,
% which is consistent with a dropped curly apostrophe.
@misc{ghedini_http3_2019,
title = {{HTTP}/3: the past, the present, and the future},
shorttitle = {{HTTP}/3},
url = {https://blog.cloudflare.com/http3-the-past-present-and-future/},
abstract = {We are now happy to announce that QUIC and HTTP/3 support is available on the Cloudflare edge network. We're excited to be joined in this announcement by Google Chrome and Mozilla Firefox, two of the leading browser vendors and partners in our effort to make the web faster and more reliable for all.},
language = {en},
urldate = {2021-11-16},
journal = {The Cloudflare Blog},
author = {Ghedini, Alessandro and Lalkaka, Rustam},
month = sep,
year = {2019},
}
% Blog post: Chromium Blog QUIC update, April 2015. Three scrape fixes:
% - author "Blog, Chromium" was a corporate name split into Last/First; now a
%   double-braced literal name.
% - "publisher = {Apr}" was a mangled month (the URL confirms /2015/04/); moved
%   to the proper month macro.
% - restored the apostrophe lost from "{Google}s" (file header warns of
%   mis-encoded characters).
@misc{blog_quic_2015,
title = {A {QUIC} update on {Google}'s experimental transport},
url = {https://blog.chromium.org/2015/04/a-quic-update-on-googles-experimental.html},
urldate = {2021-11-16},
author = {{Chromium Blog}},
month = apr,
year = {2015},
}
% RFC 9000 (the QUIC transport protocol), May 2021. The RFC number, page count,
% and series are carried in the scraped multi-line note field; doi is present.
@misc{iyengar_quic_2021,
title = {{QUIC}: {A} {UDP}-{Based} {Multiplexed} and {Secure} {Transport}},
url = {https://rfc-editor.org/rfc/rfc9000.txt},
abstract = {This document defines the core of the QUIC transport protocol. QUIC provides applications with flow-controlled streams for structured communication, low-latency connection establishment, and network path migration. QUIC includes security measures that ensure confidentiality, integrity, and availability in a range of deployment circumstances. Accompanying documents describe the integration of TLS for key negotiation, loss detection, and an exemplary congestion control algorithm.},
publisher = {RFC Editor},
author = {Iyengar, Jana and Thomson, Martin},
month = may,
year = {2021},
doi = {10.17487/RFC9000},
note = {Issue: 9000
Num Pages: 151
Series: Request for Comments
Published: RFC 9000},
}
% RFC 7540 (HTTP/2), May 2015; metadata carried in the multi-line note field.
@misc{belshe_hypertext_2015,
title = {Hypertext {Transfer} {Protocol} {Version} 2 ({HTTP}/2)},
url = {https://rfc-editor.org/rfc/rfc7540.txt},
abstract = {This specification describes an optimized expression of the semantics of the Hypertext Transfer Protocol (HTTP), referred to as HTTP version 2 (HTTP/2). HTTP/2 enables a more efficient use of network resources and a reduced perception of latency by introducing header field compression and allowing multiple concurrent exchanges on the same connection. It also introduces unsolicited push of representations from servers to clients. This specification is an alternative to, but does not obsolete, the HTTP/1.1 message syntax. HTTP's existing semantics remain unchanged.},
publisher = {RFC Editor},
author = {Belshe, Mike and Peon, Roberto and Thomson, Martin},
month = may,
year = {2015},
doi = {10.17487/RFC7540},
note = {Issue: 7540
Num Pages: 96
Series: Request for Comments
Published: RFC 7540},
}
% RFC 1945 (HTTP/1.0), May 1996.
% Fix: the official RFC title is "Hypertext Transfer Protocol -- HTTP/1.0"; the
% "--" separator was missing here (the file header warns of invisible/mis-encoded
% characters, so it was likely lost in export). Restored explicitly.
@misc{nielsen_hypertext_1996,
title = {Hypertext {Transfer} {Protocol} -- {HTTP}/1.0},
url = {https://rfc-editor.org/rfc/rfc1945.txt},
abstract = {The Hypertext Transfer Protocol (HTTP) is an application-level protocol with the lightness and speed necessary for distributed, collaborative, hypermedia information systems. This memo provides information for the Internet community. This memo does not specify an Internet standard of any kind.},
publisher = {RFC Editor},
author = {Nielsen, Henrik and Fielding, Roy T. and Berners-Lee, Tim},
month = may,
year = {1996},
doi = {10.17487/RFC1945},
note = {Issue: 1945
Num Pages: 60
Series: Request for Comments
Published: RFC 1945},
}
% RFC 2616 (HTTP/1.1), June 1999.
% Fix: the official RFC title is "Hypertext Transfer Protocol -- HTTP/1.1"; the
% "--" separator was missing (same invisible-character export loss as the
% RFC 1945 entry). Restored explicitly.
@misc{nielsen_hypertext_1999,
title = {Hypertext {Transfer} {Protocol} -- {HTTP}/1.1},
url = {https://rfc-editor.org/rfc/rfc2616.txt},
abstract = {HTTP has been in use by the World-Wide Web global information initiative since 1990. This specification defines the protocol referred to as "HTTP/1.1", and is an update to RFC 2068. [STANDARDS-TRACK]},
publisher = {RFC Editor},
author = {Nielsen, Henrik and Mogul, Jeffrey and Masinter, Larry M. and Fielding, Roy T. and Gettys, Jim and Leach, Paul J. and Berners-Lee, Tim},
month = jun,
year = {1999},
doi = {10.17487/RFC2616},
note = {Issue: 2616
Num Pages: 176
Series: Request for Comments
Published: RFC 2616},
}
% IETF Internet-Draft: draft-ietf-quic-http-34 (HTTP/3), February 2021.
% NOTE(review): this is a pre-RFC draft (the abstract itself says "DO NOT DEPLOY
% ... UNTIL IT IS IN AN RFC"); if citing the final spec, update to the published
% RFC — confirm the current status before doing so.
@techreport{bishop_hypertext_2021,
type = {Internet-{Draft}},
title = {Hypertext {Transfer} {Protocol} {Version} 3 ({HTTP}/3)},
url = {https://datatracker.ietf.org/doc/html/draft-ietf-quic-http-34},
abstract = {The QUIC transport protocol has several features that are desirable in a transport for HTTP, such as stream multiplexing, per-stream flow control, and low-latency connection establishment. This document describes a mapping of HTTP semantics over QUIC. This document also identifies HTTP/2 features that are subsumed by QUIC, and describes how HTTP/2 extensions can be ported to HTTP/3. DO NOT DEPLOY THIS VERSION OF HTTP DO NOT DEPLOY THIS VERSION OF HTTP/3 UNTIL IT IS IN AN RFC. This version is still a work in progress. For trial deployments, please use earlier versions. Note to Readers Discussion of this draft takes place on the QUIC working group mailing list (quic@ietf.org), which is archived at https://mailarchive.ietf.org/arch/search/?email\_list=quic. Working Group information can be found at https://github.com/quicwg; source code and issues list for this draft can be found at https://github.com/quicwg/base-drafts/labels/-http.},
number = {draft-ietf-quic-http-34},
institution = {Internet Engineering Task Force},
author = {Bishop, Mike},
month = feb,
year = {2021},
note = {Backup Publisher: Internet Engineering Task Force
Num Pages: 75},
}
% Journal article: SPDY over cellular networks, IEEE/ACM Transactions on
% Networking 23(6), 2015.
@article{erman_towards_2015,
title = {Towards a {SPDY}ier {Mobile} {Web}?},
volume = {23},
issn = {1558-2566},
doi = {10.1109/TNET.2015.2462737},
abstract = {Despite its widespread adoption and popularity, the Hypertext Transfer Protocol (HTTP) suffers from fundamental performance limitations. SPDY, a recently proposed alternative to HTTP, tries to address many of the limitations of HTTP (e.g., multiple connections, setup latency). In this paper, we perform a detailed measurement study to understand the benefits of using SPDY over cellular networks. Through careful measurements conducted over 4 months, we provide a detailed analysis of the performance of HTTP and SPDY, how they interact with the various layers, and their implications on Web design. Our results show that unlike in wired and 802.11 networks, SPDY does not clearly outperform HTTP over cellular networks. We identify negative interactions between the protocols used for Web access (HTTP/SPDY over TCP) and cellular radio resource management as the underlying cause. Overall performance suffers when devices go through a cellular radio state promotion after an idle period, and the consequent increase in latency. This impacts SPDY more because of the use of a single TCP connection. We conclude that a viable solution has to account for these unique cross-layer dependencies to achieve improved Web performance over cellular networks.},
number = {6},
journal = {IEEE/ACM Transactions on Networking},
author = {Erman, Jeffrey and Gopalakrishnan, Vijay and Jana, Rittwik and Ramakrishnan, Kadangode K.},
month = dec,
year = {2015},
note = {Conference Name: IEEE/ACM Transactions on Networking},
keywords = {Browsers, Cellular networks, Mobile communication, Pipeline processing, Protocols, SPDY, Servers, Web pages, mobile Web performance, wireless protocol},
pages = {2010--2023},
}
% Kakhki et al., ACM IMC 2017: cross-version evaluation methodology for QUIC,
% inferring its state machine from instrumented execution traces.
% NOTE(review): url is just the DOI resolver for the doi already given; the
% doi field alone would suffice — confirm before pruning.
@inproceedings{kakhki_taking_2017,
	address = {New York, NY, USA},
	series = {{IMC} '17},
	title = {Taking a long look at {QUIC}: an approach for rigorous evaluation of rapidly evolving transport protocols},
	isbn = {978-1-4503-5118-8},
	shorttitle = {Taking a long look at {QUIC}},
	url = {https://doi.org/10.1145/3131365.3131368},
	doi = {10.1145/3131365.3131368},
	abstract = {Google's QUIC protocol, which implements TCP-like properties at the application layer atop a UDP transport, is now used by the vast majority of Chrome clients accessing Google properties but has no formal state machine specification, limited analysis, and ad-hoc evaluations based on snapshots of the protocol implementation in a small number of environments. Further frustrating attempts to evaluate QUIC is the fact that the protocol is under rapid development, with extensive rewriting of the protocol occurring over the scale of months, making individual studies of the protocol obsolete before publication. Given this unique scenario, there is a need for alternative techniques for understanding and evaluating QUIC when compared with previous transport-layer protocols. First, we develop an approach that allows us to conduct analysis across multiple versions of QUIC to understand how code changes impact protocol effectiveness. Next, we instrument the source code to infer QUIC's state machine from execution traces. With this model, we run QUIC in a large number of environments that include desktop and mobile, wired and wireless environments and use the state machine to understand differences in transport- and application-layer performance across multiple versions of QUIC and in different environments. QUIC generally outperforms TCP, but we also identified performance issues related to window sizes, re-ordered packets, and multiplexing large number of small objects; further, we identify that QUIC's performance diminishes on mobile devices and over cellular networks.},
	urldate = {2021-11-16},
	booktitle = {Proceedings of the 2017 {Internet} {Measurement} {Conference}},
	publisher = {Association for Computing Machinery},
	author = {Kakhki, Arash Molavi and Jero, Samuel and Choffnes, David and Nita-Rotaru, Cristina and Mislove, Alan},
	month = nov,
	year = {2017},
	keywords = {QUIC, transport-layer performance},
	pages = {290--303},
}
% Lee et al., NDSS 2015 (Internet Society): DangNull-style dangling-pointer
% nullification against use-after-free.
% NOTE(review): address = {San Diego, CA} is the conference venue, not the
% publisher's city; best practice puts venues in booktitle — confirm intent.
@inproceedings{lee_preventing_2015,
	address = {San Diego, CA},
	title = {Preventing {Use}-after-free with {Dangling} {Pointers} {Nullification}},
	isbn = {978-1-891562-38-9},
	url = {https://www.ndss-symposium.org/ndss2015/ndss-2015-programme/preventing-use-after-free-dangling-pointers-nullification/},
	doi = {10.14722/ndss.2015.23238},
	abstract = {Many system components and network applications are written in languages that are prone to memory corruption vulnerabilities. There have been countless cases where simple mistakes by developers resulted in memory corruption vulnerabilities and consequently security exploits. While there have been tremendous research efforts to mitigate these vulnerabilities, useafter-free still remains one of the most critical and popular attack vectors because existing proposals have not adequately addressed the challenging program analysis and runtime performance issues.},
	language = {en},
	urldate = {2021-11-11},
	booktitle = {Proceedings 2015 {Network} and {Distributed} {System} {Security} {Symposium}},
	publisher = {Internet Society},
	author = {Lee, Byoungyoung and Song, Chengyu and Jang, Yeongjin and Wang, Tielei and Kim, Taesoo and Lu, Long and Lee, Wenke},
	year = {2015},
}
% Go runtime package documentation on pkg.go.dev (accessed Nov 2021).
% Fix: double-brace the corporate author — {Go authors} is parsed as
% First="Go", Last="authors"; {{Go authors}} keeps it a single literal name.
@misc{go_authors_runtime_2021,
	title = {runtime package - runtime - pkg.go.dev},
	url = {https://pkg.go.dev/runtime},
	abstract = {Package runtime contains operations that interact with Go's runtime system, such as functions to control goroutines. It also includes the low-level type information used by the reflect package; see reflect's documentation for the programmable interface to the run-time type system.},
	urldate = {2021-11-10},
	author = {{Go authors}},
	month = nov,
	year = {2021},
}
% Oracle HotSpot GC Tuning Guide for Java SE 11 (vendor documentation, Sep 2018).
% Single-token corporate author {Oracle} parses as a bare surname, which is fine;
% the non-standard copyright field is silently ignored by BibTeX/Biber.
@misc{oracle_java_2018,
	title = {Java {Platform}, {Standard} {Edition} {HotSpot} {Virtual} {Machine} {Garbage} {Collection} {Tuning} {Guide} {Release} 11},
	copyright = {Copyright © 2014, 2018, Oracle and/or its affiliates. All rights reserved.},
	url = {https://docs.oracle.com/javase/10/gctuning/factors-affecting-garbage-collection-performance.htm#JSGCT-GUID-5508674B-F32D-4B02-9002-D0D8C7CDDC75},
	abstract = {This guide describes the garbage collection methods included in the Java HotSpot Virtual Machine (Java HotSpot VM) and helps you determine which one is the best for your needs.},
	language = {en-US},
	urldate = {2021-11-10},
	author = {Oracle},
	month = sep,
	year = {2018},
}
% Grgic, Mihaljevic & Radovan, MIPRO 2018: benchmark comparison of JVM garbage
% collectors (G1 vs. Parallel vs. CMS).
% Fixes: deduplicate the auto-exported keyword pair "Memory Management"/"Memory
% management"; escape the raw ć as the BibTeX special character {\'c} so the
% name sorts and labels correctly under classic (8-bit) BibTeX.
@inproceedings{grgic_comparison_2018,
	title = {Comparison of garbage collectors in {Java} programming language},
	doi = {10.23919/MIPRO.2018.8400277},
	abstract = {Considering the need for continuous and uninterrupted service of modern software applications, it is valuable to analyze how garbage collection (GC) algorithms are handling memory challenges. Widely adopted general-purpose programming languages, like Java, represent an inevitable foundation for many modern application developments. In Java Platform, Standard Edition, and accompanying Java Virtual Machine (JVM), several GCs could be used. In the latest version 9.0.1 of Java SE Development Kit (JDK) default GC was changed to Garbage-First (G1) GC, now becoming widely adopted in addition to previously used Parallel GC and Concurrent Mark \& Sweep (CMS) GC. Since GC is a vital part of the JVM, changes and upgrades to its implementation, which reflect upon performance results, are properties worth exploring. Using benchmarks to create non-trivial memory pressures, and with extensive data monitoring, this paper analyzes insights gathered about critical performance factors across several GC algorithms. With the evaluation of benchmark elements, such as object allocations from young area to old area and the duration of the collection time, it was possible to compare GC behavior and assess the overall memory management. This paper presents our preliminary research performed in an academic environment on several benchmark cases and our conclusion about it.},
	booktitle = {2018 41st {International} {Convention} on {Information} and {Communication} {Technology}, {Electronics} and {Microelectronics} ({MIPRO})},
	author = {Grgic, H. and Mihaljevi{\'c}, B. and Radovan, A.},
	month = may,
	year = {2018},
	keywords = {Benchmark testing, G1, Garbage Collector, Garbage-First, Java, Memory Management, Software, Software algorithms, Standards},
	pages = {1539--1544},
}
% Davis et al., ASPLOS 2019 (ACM): CheriABI — full CHERI POSIX C run-time
% environment with spatial and referential memory safety.
@inproceedings{davis_cheriabi_2019,
	address = {New York, NY, USA},
	series = {{ASPLOS} '19},
	title = {{CheriABI}: {Enforcing} {Valid} {Pointer} {Provenance} and {Minimizing} {Pointer} {Privilege} in the {POSIX} {C} {Run}-time {Environment}},
	isbn = {978-1-4503-6240-5},
	shorttitle = {{CheriABI}},
	url = {https://doi.org/10.1145/3297858.3304042},
	doi = {10.1145/3297858.3304042},
	abstract = {The CHERI architecture allows pointers to be implemented as capabilities (rather than integer virtual addresses) in a manner that is compatible with, and strengthens, the semantics of the C language. In addition to the spatial protections offered by conventional fat pointers, CHERI capabilities offer strong integrity, enforced provenance validity, and access monotonicity. The stronger guarantees of these architectural capabilities must be reconciled with the real-world behavior of operating systems, run-time environments, and applications. When the process model, user-kernel interactions, dynamic linking, and memory management are all considered, we observe that simple derivation of architectural capabilities is insufficient to describe appropriate access to memory. We bridge this conceptual gap with a notional abstract capability that describes the accesses that should be allowed at a given point in execution, whether in the kernel or userspace. To investigate this notion at scale, we describe the first adaptation of a full C-language operating system (FreeBSD) with an enterprise database (PostgreSQL) for complete spatial and referential memory safety. We show that awareness of abstract capabilities, coupled with CHERI architectural capabilities, can provide more complete protection, strong compatibility, and acceptable performance overhead compared with the pre-CHERI baseline and software-only approaches. Our observations also have potentially significant implications for other mitigation techniques.},
	urldate = {2021-11-10},
	booktitle = {Proceedings of the {Twenty}-{Fourth} {International} {Conference} on {Architectural} {Support} for {Programming} {Languages} and {Operating} {Systems}},
	publisher = {Association for Computing Machinery},
	author = {Davis, Brooks and Watson, Robert N. M. and Richardson, Alexander and Neumann, Peter G. and Moore, Simon W. and Baldwin, John and Chisnall, David and Clarke, Jessica and Filardo, Nathaniel Wesley and Gudka, Khilan and Joannou, Alexandre and Laurie, Ben and Markettos, A. Theodore and Maste, J. Edward and Mazzinghi, Alfredo and Napierala, Edward Tomasz and Norton, Robert M. and Roe, Michael and Sewell, Peter and Son, Stacey and Woodruff, Jonathan},
	month = apr,
	year = {2019},
	keywords = {cheri, hardware, operating systems, security},
	pages = {379--393},
}
% Filardo et al., IEEE S&P 2020: Cornucopia — sweeping capability revocation
% for temporal heap safety on CHERI.
% NOTE(review): keywords like "RNA"/"Licenses" look like IEEE auto-export
% junk, and the surname is indexed as "Wesley Filardo" — verify against the
% published author list before fixing.
@inproceedings{wesley_filardo_cornucopia_2020,
	title = {Cornucopia: {Temporal} {Safety} for {CHERI} {Heaps}},
	shorttitle = {Cornucopia},
	doi = {10.1109/SP40000.2020.00098},
	abstract = {Use-after-free violations of temporal memory safety continue to plague software systems, underpinning many high-impact exploits. The CHERI capability system shows great promise in achieving C and C++ language spatial memory safety, preventing out-of-bounds accesses. Enforcing language-level temporal safety on CHERI requires capability revocation, traditionally achieved either via table lookups (avoided for performance in the CHERI design) or by identifying capabilities in memory to revoke them (similar to a garbage-collector sweep). CHERIvoke, a prior feasibility study, suggested that CHERI's tagged capabilities could make this latter strategy viable, but modeled only architectural limits and did not consider the full implementation or evaluation of the approach.Cornucopia is a lightweight capability revocation system for CHERI that implements non-probabilistic C/C++ temporal memory safety for standard heap allocations. It extends the CheriBSD virtual-memory subsystem to track capability flow through memory and provides a concurrent kernel-resident revocation service that is amenable to multi-processor and hardware acceleration. We demonstrate an average overhead of less than 2\% and a worst-case of 8.9\% for concurrent revocation on compatible SPEC CPU2006 benchmarks on a multi-core CHERI CPU on FPGA, and we validate Cornucopia against the Juliet test suite's corpus of temporally unsafe programs. We test its compatibility with a large corpus of C programs by using a revoking allocator as the system allocator while booting multi-user CheriBSD. Cornucopia is a viable strategy for always-on temporal heap memory safety, suitable for production environments.},
	booktitle = {2020 {IEEE} {Symposium} on {Security} and {Privacy} ({SP})},
	author = {Wesley Filardo, Nathaniel and Gutstein, Brett F. and Woodruff, Jonathan and Ainsworth, Sam and Paul-Trifu, Lucian and Davis, Brooks and Xia, Hongyan and Tomasz Napierala, Edward and Richardson, Alexander and Baldwin, John and Chisnall, David and Clarke, Jessica and Gudka, Khilan and Joannou, Alexandre and Theodore Markettos, A. and Mazzinghi, Alfredo and Norton, Robert M. and Roe, Michael and Sewell, Peter and Son, Stacey and Jones, Timothy M. and Moore, Simon W. and Neumann, Peter G. and Watson, Robert N. M.},
	month = may,
	year = {2020},
	note = {ISSN: 2375-1207},
	keywords = {Integrated circuits, Licenses, Privacy, RNA, Security},
	pages = {608--625},
}
% Joannou et al., IEEE ICCD 2017: cache behaviour and compression of in-memory
% tag tables for tagged-memory architectures.
@inproceedings{joannou_efficient_2017,
	title = {Efficient {Tagged} {Memory}},
	doi = {10.1109/ICCD.2017.112},
	abstract = {We characterize the cache behavior of an in-memory tag table and demonstrate that an optimized implementation can typically achieve a near-zero memory traffic overhead. Both industry and academia have repeatedly demonstrated tagged memory as a key mechanism to enable enforcement of powerful security invariants, including capabilities, pointer integrity, watchpoints, and information-flow tracking. A single-bit tag shadowspace is the most commonly proposed requirement, as one bit is the minimum metadata needed to distinguish between an untyped data word and any number of new hardware-enforced types. We survey various tag shadowspace approaches and identify their common requirements and positive features of their implementations. To avoid non-standard memory widths, we identify the most practical implementation for tag storage to be an in-memory table managed next to the DRAM controller. We characterize the caching performance of such a tag table and demonstrate a DRAM traffic overhead below 5\% for the vast majority of applications. We identify spatial locality on a page scale as the primary factor that enables surprisingly high table cache-ability. We then demonstrate tag-table compression for a set of common applications. A hierarchical structure with elegantly simple optimizations reduces DRAM traffic overhead to below 1\% for most applications. These insights and optimizations pave the way for commercial applications making use of single-bit tags stored in commodity memory.},
	booktitle = {2017 {IEEE} {International} {Conference} on {Computer} {Design} ({ICCD})},
	author = {Joannou, Alexandre and Woodruff, Jonathan and Kovacsics, Robert and Moore, Simon W. and Bradbury, Alex and Xia, Hongyan and Watson, Robert N.M. and Chisnall, David and Roe, Michael and Davis, Brooks and Napierala, Edward and Baldwin, John and Gudka, Khilan and Neumann, Peter G. and Mazzinghi, Alfredo and Richardson, Alex and Son, Stacey and Markettos, A. Theodore},
	month = nov,
	year = {2017},
	note = {ISSN: 1063-6404},
	keywords = {Caches, Computer architecture, Error correction codes, Hardware, Memory, Metadata, Pipelines, Processor, Random access memory, Safety, Security, r265-week-6},
	pages = {641--648},
}
% Xia et al., MICRO 2019 (ACM): CHERIvoke — sweeping pointer revocation for
% temporal memory safety on CHERI; precursor feasibility study to Cornucopia.
@inproceedings{xia_cherivoke_2019,
	address = {New York, NY, USA},
	series = {{MICRO} '52},
	title = {{CHERIvoke}: {Characterising} {Pointer} {Revocation} using {CHERI} {Capabilities} for {Temporal} {Memory} {Safety}},
	isbn = {978-1-4503-6938-1},
	shorttitle = {{CHERIvoke}},
	url = {https://doi.org/10.1145/3352460.3358288},
	doi = {10.1145/3352460.3358288},
	abstract = {A lack of temporal safety in low-level languages has led to an epidemic of use-after-free exploits. These have surpassed in number and severity even the infamous buffer-overflow exploits violating spatial safety. Capability addressing can directly enforce spatial safety for the C language by enforcing bounds on pointers and by rendering pointers unforgeable. Nevertheless, an efficient solution for strong temporal memory safety remains elusive. CHERI is an architectural extension to provide hardware capability addressing that is seeing significant commercial and open-source interest. We show that CHERI capabilities can be used as a foundation to enable low-cost heap temporal safety by facilitating out-of-date pointer revocation, as capabilities enable precise and efficient identification and invalidation of pointers, even when using unsafe languages such as C. We develop CHERIvoke, a technique for deterministic and fast sweeping revocation to enforce temporal safety on CHERI systems. CHERIvoke quarantines freed data before periodically using a small shadow map to revoke all dangling pointers in a single sweep of memory, and provides a tunable trade-off between performance and heap growth. We evaluate the performance of such a system using high-performance x86 processors, and further analytically examine its primary overheads. When configured with a heap-size overhead of 25\%, we find that CHERIvoke achieves an average execution-time overhead of under 5\%, far below the overheads associated with traditional garbage collection, revocation, or page-table systems.},
	urldate = {2021-11-10},
	booktitle = {Proceedings of the 52nd {Annual} {IEEE}/{ACM} {International} {Symposium} on {Microarchitecture}},
	publisher = {Association for Computing Machinery},
	author = {Xia, Hongyan and Woodruff, Jonathan and Ainsworth, Sam and Filardo, Nathaniel W. and Roe, Michael and Richardson, Alexander and Rugg, Peter and Neumann, Peter G. and Moore, Simon W. and Watson, Robert N. M. and Jones, Timothy M.},
	month = oct,
	year = {2019},
	keywords = {architecture, r265-week-6, security, temporal safety, use-after-free},
	pages = {545--557},
}
% Watson et al., IEEE S&P 2015: CHERI hybrid capability-system architecture for
% scalable software compartmentalization (BERI/FreeBSD/LLVM prototype).
@inproceedings{watson_cheri_2015,
	title = {{CHERI}: {A} {Hybrid} {Capability}-{System} {Architecture} for {Scalable} {Software} {Compartmentalization}},
	shorttitle = {{CHERI}},
	doi = {10.1109/SP.2015.9},
	abstract = {CHERI extends a conventional RISC Instruction-Set Architecture, compiler, and operating system to support fine-grained, capability-based memory protection to mitigate memory-related vulnerabilities in C-language TCBs. We describe how CHERI capabilities can also underpin a hardware-software object-capability model for application compartmentalization that can mitigate broader classes of attack. Prototyped as an extension to the open-source 64-bit BERI RISC FPGA soft-core processor, Free BSD operating system, and LLVM compiler, we demonstrate multiple orders-of-magnitude improvement in scalability, simplified programmability, and resulting tangible security benefits as compared to compartmentalization based on pure Memory-Management Unit (MMU) designs. We evaluate incrementally deployable CHERI-based compartmentalization using several real-world UNIX libraries and applications.},
	booktitle = {2015 {IEEE} {Symposium} on {Security} and {Privacy}},
	author = {Watson, Robert N.M. and Woodruff, Jonathan and Neumann, Peter G. and Moore, Simon W. and Anderson, Jonathan and Chisnall, David and Dave, Nirav and Davis, Brooks and Gudka, Khilan and Laurie, Ben and Murdoch, Steven J. and Norton, Robert and Roe, Michael and Son, Stacey and Vadera, Munraj},
	month = may,
	year = {2015},
	note = {ISSN: 2375-1207},
	keywords = {CHERI processor, Hardware, Kernel, Libraries, Reduced instruction set computing, Registers, Security, capability system, computer architecture, memory protection, object capabilities, r265-week-6, software compartmentalization},
	pages = {20--37},
}
% Benson, Akella & Maltz, ACM IMC 2010: empirical traffic characterisation of
% 10 university, enterprise, and cloud data centers.
@inproceedings{benson_network_2010,
	address = {New York, NY, USA},
	series = {{IMC} '10},
	title = {Network traffic characteristics of data centers in the wild},
	isbn = {978-1-4503-0483-2},
	url = {https://doi.org/10.1145/1879141.1879175},
	doi = {10.1145/1879141.1879175},
	abstract = {Although there is tremendous interest in designing improved networks for data centers, very little is known about the network-level traffic characteristics of data centers today. In this paper, we conduct an empirical study of the network traffic in 10 data centers belonging to three different categories, including university, enterprise campus, and cloud data centers. Our definition of cloud data centers includes not only data centers employed by large online service providers offering Internet-facing applications but also data centers used to host data-intensive (MapReduce style) applications). We collect and analyze SNMP statistics, topology and packet-level traces. We examine the range of applications deployed in these data centers and their placement, the flow-level and packet-level transmission properties of these applications, and their impact on network and link utilizations, congestion and packet drops. We describe the implications of the observed traffic patterns for data center internal traffic engineering as well as for recently proposed architectures for data center networks.},
	urldate = {2021-11-09},
	booktitle = {Proceedings of the 10th {ACM} {SIGCOMM} conference on {Internet} measurement},
	publisher = {Association for Computing Machinery},
	author = {Benson, Theophilus and Akella, Aditya and Maltz, David A.},
	month = nov,
	year = {2010},
	keywords = {characterization, data center traffic},
	pages = {267--280},
}
% Ling et al., IEEE INFOCOM 2014: TorWard — IDS-based discovery and
% classification of malicious traffic at Tor exits.
@inproceedings{ling_torward_2014,
	title = {{TorWard}: {Discovery} of malicious traffic over {Tor}},
	shorttitle = {{TorWard}},
	doi = {10.1109/INFOCOM.2014.6848074},
	abstract = {Tor is a popular low-latency anonymous communication system. However, it is currently abused in various ways. Tor exit routers are frequently troubled by administrative and legal complaints. To gain an insight into such abuse, we design and implement a novel system, TorWard, for the discovery and systematic study of malicious traffic over Tor. The system can avoid legal and administrative complaints and allows the investigation to be performed in a sensitive environment such as a university campus. An IDS (Intrusion Detection System) is used to discover and classify malicious traffic. We performed comprehensive analysis and extensive real-world experiments to validate the feasibility and effectiveness of TorWard. Our data shows that around 10\% Tor traffic can trigger IDS alerts. Malicious traffic includes P2P traffic, malware traffic (e.g., botnet traffic), DoS (Denial-of-Service) attack traffic, spam, and others. Around 200 known malware have been identified. To the best of our knowledge, we are the first to perform malicious traffic categorization over Tor.},
	booktitle = {{IEEE} {INFOCOM} 2014 - {IEEE} {Conference} on {Computer} {Communications}},
	author = {Ling, Zhen and Luo, Junzhou and Wu, Kui and Yu, Wei and Fu, Xinwen},
	month = apr,
	year = {2014},
	note = {ISSN: 0743-166X},
	keywords = {Bandwidth, Computers, Intrusion Detection System, Logic gates, Malicious Traffic, Malware, Mobile handsets, Ports (Computers), Servers, Tor},
	pages = {1402--1410},
}
% Sharangpani & Barton, Intel white paper (Nov 1994) on the Pentium FDIV flaw.
% Same data as before; fields reordered (author first) and aligned for readability.
@techreport{sharangpani_statistical_1994,
	author      = {Sharangpani, H. P. and Barton, M. L.},
	title       = {Statistical {Analysis} of {Floating} {Point} {Flaw} in the {Pentium} {Processor}},
	institution = {Intel Corporation},
	month       = nov,
	year        = {1994},
	urldate     = {2021-11-04},
}
% Bin et al., IBM Systems Journal 41(3), 2002: CSP formulation for random test
% program generation in processor verification.
% Fixes to extraction garbling in the abstract: "magnitude of 264" restores the
% lost superscript as {$2^{64}$}; Unicode smart quotes become TeX ``...'' so the
% entry stays safe under classic 8-bit BibTeX.
@article{bin_using_2002,
	title = {Using a constraint satisfaction formulation and solution techniques for random test program generation},
	volume = {41},
	issn = {0018-8670},
	doi = {10.1147/sj.413.0386},
	abstract = {Automatic generation of test programs plays a major role in the verification of modern processors and hardware systems. In this paper, we formulate the generation of test programs as a constraint satisfaction problem and develop techniques for dealing with the challenges we face, most notably: huge variable domains (e.g., magnitude of {$2^{64}$}) and the need to randomly generate ``well distributed'' samplings of the solution space. We describe several applications of our method, which include specific test generators targeted at various parts of a design or stages of the verification process.},
	number = {3},
	journal = {IBM Systems Journal},
	author = {Bin, E. and Emek, R. and Shurek, G. and Ziv, A.},
	year = {2002},
	note = {Conference Name: IBM Systems Journal},
	pages = {386--402},
}
% Lee & Cook: riscv-torture, the RISC-V random torture-test generator
% (GitHub repository, UC Berkeley Architecture Research).
% Same data as before; fields reordered (author/title first) and aligned.
@misc{lee_ucb-barriscv-torture_2012,
	author    = {Lee, Yunsup and Cook, Henry},
	title     = {ucb-bar/riscv-torture},
	abstract  = {RISC-V Torture Test},
	url       = {https://github.com/ucb-bar/riscv-torture},
	publisher = {UC Berkeley Architecture Research},
	month     = jan,
	year      = {2012},
	urldate   = {2021-11-03},
	keywords  = {r265-week-5},
}
% Campbell & Stark, Science of Computer Programming 118 (2016): validating a
% HOL4 Cortex-M0 model against hardware using SMT-generated test states.
@article{campbell_randomised_2016,
	series = {Formal {Methods} for {Industrial} {Critical} {Systems} ({FMICS}2014)},
	title = {Randomised testing of a microprocessor model using {SMT}-solver state generation},
	volume = {118},
	issn = {0167-6423},
	url = {https://www.sciencedirect.com/science/article/pii/S0167642315003159},
	doi = {10.1016/j.scico.2015.10.012},
	abstract = {We validate a HOL4 model of the ARM Cortex-M0 microcontroller core by testing the model's behaviour on randomly chosen instructions against real chips from several manufacturers. The model and our intended application involve precise timing information about instruction execution, but the implementations are pipelined, so checking the behaviour of single instructions would not give us sufficient confidence in the model. Thus we test the model using sequences of randomly chosen instructions. The main challenge is to meet the constraints on the initial and intermediate execution states: we must ensure that memory accesses are in range and that we respect restrictions on the instructions. By careful transformation of these constraints an off-the-shelf SMT solver can be used to find suitable states for executing test sequences. We also use additional constraints to test our hypotheses about the timing anomalies encountered.},
	language = {en},
	urldate = {2021-11-03},
	journal = {Science of Computer Programming},
	author = {Campbell, Brian and Stark, Ian},
	month = mar,
	year = {2016},
	keywords = {HOL, Microprocessor models, Randomised testing, SMT, r265-week-5},
	pages = {60--76},
}
% Adir et al., IEEE Design & Test of Computers 21(2), 2004: Genesys-Pro, IBM's
% constraint-based test-program generator for functional processor verification.
@article{adir_genesys-pro_2004,
	title = {Genesys-{Pro}: innovations in test program generation for functional processor verification},
	volume = {21},
	issn = {1558-1918},
	shorttitle = {Genesys-{Pro}},
	doi = {10.1109/MDT.2004.1277900},
	abstract = {Functional verification is widely recognized as the bottleneck of the hardware design cycle. With the ever-growing demand for greater performance and faster time to market, coupled with the exponential growth in hardware size, verification has become increasingly difficult. Although formal methods such as model checking and theorem proving have resulted in noticeable progress, these approaches apply only to the verification of relatively small design blocks or to very focused verification goals. Current industry practice is to use separate, automatic, random stimuli generators for processor- and multiprocessor-level verification. The generated stimuli, usually in the form of test programs, trigger architecture and microarchitecture events defined by a verification plan. MAC-based algorithms are well suited for the test program generation domain because they postpone heuristic decisions until after consideration of all architectural and testing-knowledge constraints. Geneysys-Pro is currently the main test generation tool for functional verification of IBM processors, including several complex processors. We've found that the new language considerably reduces the effort needed to define and maintain knowledge specific to an implementation and verification plan.},
	number = {2},
	journal = {IEEE Design Test of Computers},
	author = {Adir, A. and Almog, E. and Fournier, L. and Marcus, E. and Rimon, M. and Vinov, M. and Ziv, A.},
	month = mar,
	year = {2004},
	note = {Conference Name: IEEE Design Test of Computers},
	keywords = {Computer languages, Design engineering, Engines, Knowledge engineering, Microprocessors, Power generation, Power system modeling, Spine, Technological innovation, Testing, r265-week-5},
	pages = {84--93},
}
% riscv/sail-riscv GitHub repository (the Sail RISC-V ISA model), accessed
% Nov 2021; no personal author recorded, hence the noauthor key.
% Same data as before; fields reordered and aligned.
@misc{noauthor_riscv_2021,
	title     = {{RISCV} {Sail} {Model}},
	abstract  = {Sail RISC-V model},
	url       = {https://github.com/riscv/sail-riscv},
	publisher = {RISC-V},
	month     = nov,
	year      = {2021},
	urldate   = {2021-11-03},
	note      = {original-date: 2018-11-27T15:24:33Z},
	keywords  = {r265-week-5},
}
% Armstrong et al., Proc. ACM Program. Lang. 3 (POPL), 2019: Sail ISA semantics
% for ARMv8-A, RISC-V, MIPS, and CHERI-MIPS.
% Fix: the title had been sentence-cased into "{ARMv8}-a, {RISC}-v"; restore and
% brace-protect the official architecture names ARMv8-A, RISC-V, CHERI-MIPS.
@article{armstrong_isa_2019,
	title = {{ISA} semantics for {ARMv8-A}, {RISC-V}, and {CHERI-MIPS}},
	volume = {3},
	url = {https://doi.org/10.1145/3290384},
	doi = {10.1145/3290384},
	abstract = {Architecture specifications notionally define the fundamental interface between hardware and software: the envelope of allowed behaviour for processor implementations, and the basic assumptions for software development and verification. But in practice, they are typically prose and pseudocode documents, not rigorous or executable artifacts, leaving software and verification on shaky ground. In this paper, we present rigorous semantic models for the sequential behaviour of large parts of the mainstream ARMv8-A, RISC-V, and MIPS architectures, and the research CHERI-MIPS architecture, that are complete enough to boot operating systems, variously Linux, FreeBSD, or seL4. Our ARMv8-A models are automatically translated from authoritative ARM-internal definitions, and (in one variant) tested against the ARM Architecture Validation Suite. We do this using a custom language for ISA semantics, Sail, with a lightweight dependent type system, that supports automatic generation of emulator code in C and OCaml, and automatic generation of proof-assistant definitions for Isabelle, HOL4, and (currently only for MIPS) Coq. We use the former for validation, and to assess specification coverage. To demonstrate the usability of the latter, we prove (in Isabelle) correctness of a purely functional characterisation of ARMv8-A address translation. We moreover integrate the RISC-V model into the RMEM tool for (user-mode) relaxed-memory concurrency exploration. We prove (on paper) the soundness of the core Sail type system. We thereby take a big step towards making the architectural abstraction actually well-defined, establishing foundations for verification and reasoning.},
	number = {POPL},
	urldate = {2021-11-03},
	journal = {Proceedings of the ACM on Programming Languages},
	author = {Armstrong, Alasdair and Bauereiss, Thomas and Campbell, Brian and Reid, Alastair and Gray, Kathryn E. and Norton, Robert M. and Mundkur, Prashanth and Wassell, Mark and French, Jon and Pulte, Christopher and Flur, Shaked and Stark, Ian and Krishnaswami, Neel and Sewell, Peter},
	month = jan,
	year = {2019},
	keywords = {Instruction Set Architectures, Semantics, Theorem Proving, r265-week-5},
	pages = {71:1--71:31},
}
% Reid, Proc. ACM Program. Lang. 1 (OOPSLA), 2017: formal validation of the Arm
% v8-M architecture specification against a higher-level security specification.
% Fix: brace-protect {v8-M} in the title — the abstract's "v8-M" confirms the
% capital M, which sentence-casing styles would otherwise lowercase.
@article{reid_who_2017,
	title = {Who guards the guards? formal validation of the {Arm} {v8-M} architecture specification},
	volume = {1},
	shorttitle = {Who guards the guards?},
	url = {https://doi.org/10.1145/3133912},
	doi = {10.1145/3133912},
	abstract = {Software and hardware are increasingly being formally verified against specifications, but how can we verify the specifications themselves? This paper explores what it means to formally verify a specification. We solve three challenges: (1) How to create a secondary, higher-level specification that can be effectively reviewed by processor designers who are not experts in formal verification; (2) How to avoid common-mode failures between the specifications; and (3) How to automatically verify the two specifications against each other. One of the most important specifications for software verification is the processor specification since it defines the behaviour of machine code and of hardware protection features used by operating systems. We demonstrate our approach on ARM's v8-M Processor Specification, which is intended to improve the security of Internet of Things devices. Thus, we focus on establishing the security guarantees the architecture is intended to provide. Despite the fact that the ARM v8-M specification had previously been extensively tested, we found twelve bugs (including two security bugs) that have all been fixed by ARM.},
	number = {OOPSLA},
	urldate = {2021-11-03},
	journal = {Proceedings of the ACM on Programming Languages},
	author = {Reid, Alastair},
	month = oct,
	year = {2017},
	keywords = {Formal Verification, ISA, Specification, r265-week-5},
	pages = {88:1--88:24},
}
@inproceedings{williams_unikernels_2018,
  title     = {Unikernels as {Processes}},
  isbn      = {978-1-4503-6011-1},
  url       = {https://doi.org/10.1145/3267809.3267845},
  doi       = {10.1145/3267809.3267845},
  abstract  = {System virtualization (e.g., the virtual machine abstraction) has been established as the de facto standard form of isolation in multi-tenant clouds. More recently, unikernels have emerged as a way to reuse VM isolation while also being lightweight by eliminating the general purpose OS (e.g., Linux) from the VM. Instead, unikernels directly run the application (linked with a library OS) on the virtual hardware. In this paper, we show that unikernels do not actually require a virtual hardware abstraction, but can achieve similar levels of isolation when running as processes by leveraging existing kernel system call whitelisting mechanisms. Moreover, we show that running unikernels as processes reduces hardware requirements, enables the use of standard process debugging and management tooling, and improves the already impressive performance that unikernels exhibit.},
  booktitle = {Proceedings of the {ACM} {Symposium} on {Cloud} {Computing}},
  series    = {{SoCC} '18},
  address   = {New York, NY, USA},
  publisher = {Association for Computing Machinery},
  author    = {Williams, Dan and Koller, Ricardo and Lucina, Martin and Prakash, Nikhil},
  month     = oct,
  year      = {2018},
  urldate   = {2021-11-03},
  keywords  = {cloud computing, security, unikernels, virtualization},
  pages     = {199--211},
}
@article{madhavapeddy_unikernels_2013,
  title      = {Unikernels: library operating systems for the cloud},
  shorttitle = {Unikernels},
  volume     = {41},
  number     = {1},
  issn       = {0163-5964},
  url        = {https://doi.org/10.1145/2490301.2451167},
  doi        = {10.1145/2490301.2451167},
  abstract   = {We present unikernels, a new approach to deploying cloud services via applications written in high-level source code. Unikernels are single-purpose appliances that are compile-time specialised into standalone kernels, and sealed against modification when deployed to a cloud platform. In return they offer significant reduction in image sizes, improved efficiency and security, and should reduce operational costs. Our Mirage prototype compiles OCaml code into unikernels that run on commodity clouds and offer an order of magnitude reduction in code size without significant performance penalty. The architecture combines static type-safety with a single address-space layout that can be made immutable via a hypervisor extension. Mirage contributes a suite of type-safe protocol libraries, and our results demonstrate that the hypervisor is a platform that overcomes the hardware compatibility issues that have made past library operating systems impractical to deploy in the real-world.},
  journal    = {ACM SIGARCH Computer Architecture News},
  author     = {Madhavapeddy, Anil and Mortier, Richard and Rotsos, Charalampos and Scott, David and Singh, Balraj and Gazagnaire, Thomas and Smith, Steven and Hand, Steven and Crowcroft, Jon},
  month      = mar,
  year       = {2013},
  urldate    = {2021-11-02},
  keywords   = {functional programming, hypervisor, microkernel},
  pages      = {461--472},
}
@inproceedings{atlidakis_posix_2016,
  title      = {{POSIX} abstractions in modern operating systems: the old, the new, and the missing},
  shorttitle = {{POSIX} abstractions in modern operating systems},
  isbn       = {978-1-4503-4240-7},
  url        = {https://doi.org/10.1145/2901318.2901350},
  doi        = {10.1145/2901318.2901350},
  abstract   = {The POSIX standard, developed 25 years ago, comprises a set of operating system (OS) abstractions that aid application portability across UNIX-based OSes. While OSes and applications have evolved tremendously over the last 25 years, POSIX, and the basic set of abstractions it provides, has remained largely unchanged. Little has been done to measure how and to what extent traditional POSIX abstractions are being used in modern OSes, and whether new abstractions are taking form, dethroning traditional ones. We explore these questions through a study of POSIX usage in modern desktop and mobile OSes: Android, OS X, and Ubuntu. Our results show that new abstractions are taking form, replacing several prominent traditional abstractions in POSIX. While the changes are driven by common needs and are conceptually similar across the three OSes, they are not converging on any new standard, increasing fragmentation.},
  booktitle  = {Proceedings of the {Eleventh} {European} {Conference} on {Computer} {Systems}},
  series     = {{EuroSys} '16},
  address    = {New York, NY, USA},
  publisher  = {Association for Computing Machinery},
  author     = {Atlidakis, Vaggelis and Andrus, Jeremy and Geambasu, Roxana and Mitropoulos, Dimitris and Nieh, Jason},
  month      = apr,
  year       = {2016},
  urldate    = {2021-11-02},
  pages      = {1--17},
}
@inproceedings{winter_how_2012,
  title     = {How the {Great} {Firewall} of {China} is {Blocking} {Tor}},
  url       = {https://www.usenix.org/conference/foci12/workshop-program/presentation/winter},
  booktitle = {2nd {USENIX} Workshop on Free and Open Communications on the Internet ({FOCI} 12)},
  publisher = {USENIX Association},
  language  = {en},
  urldate   = {2021-10-30},
  author    = {Winter, Philipp and Lindskog, Stefan},
  year      = {2012},
}
@techreport{dingledine_tor_2004,
  title       = {Tor: {The} {Second}-{Generation} {Onion} {Router}},
  shorttitle  = {Tor},
  url         = {https://apps.dtic.mil/sti/citations/ADA465464},
  abstract    = {We present Tor, a circuit-based low-latency anonymous communication service. This second-generation Onion Routing system addresses limitations in the original design by adding perfect forward secrecy, congestion control, directory servers, integrity checking, configurable exit policies, and a practical design for location-hidden services via rendezvous points. Tor works on the real-world Internet, requires no special privileges or kernel modifications, requires little synchronization or coordination between nodes, and provides a reasonable tradeoff between anonymity, usability, and efficiency. We briefly describe our experiences with an international network of more than 30 nodes. We close with a list of open problems in anonymous communication.},
  language    = {en},
  urldate     = {2021-10-30},
  institution = {Naval Research Laboratory, Washington, DC},
  author      = {Dingledine, Roger and Mathewson, Nick and Syverson, Paul},
  month       = jan,
  year        = {2004},
  note        = {Section: Technical Reports},
}
@inproceedings{mukherjee_detailed_2002,
  title     = {Detailed design and evaluation of redundant multi-threading alternatives},
  doi       = {10.1109/ISCA.2002.1003566},
  abstract  = {Exponential growth in the number of on-chip transistors, coupled. with reductions in. voltage levels, makes each generation of microprocessors increasingly vulnerable to transient faults. In a multi-threaded environment, we can detect these faults by running two copies of the same program as separate threads, feeding them identical inputs, and comparing their outputs, a technique we call redundant multi-threading (RMT). This paper studies RMT techniques in the context of both single- and dual-processor simultaneous multi-threaded (SMT) single-chip devices. Using a detailed, commercial-grade, SMT processor design we uncover subtle RMT implementation complexities, and find that RMT can be a more significant burden for single-processor devices than prior studies indicate. However, a novel application of RMT techniques in a dual-processor device, which we term chip-level redundant threading, shows higher performance than lock-stepping the two cores, especially on multi-threaded workloads.},
  booktitle = {Proceedings 29th {Annual} {International} {Symposium} on {Computer} {Architecture}},
  author    = {Mukherjee, Shubhendu S. and Kontz, Michael and Reinhardt, Steven K.},
  month     = may,
  year      = {2002},
  note      = {ISSN: 1063-6897},
  keywords  = {Degradation, Fault detection, Hardware, Microprocessors, Multithreading, Process design, Surface-mount technology, Transistors, Voltage, Yarn},
  pages     = {99--110},
}
@inproceedings{gomaa_transient-fault_2003,
  title     = {Transient-fault recovery for chip multiprocessors},
  doi       = {10.1109/ISCA.2003.1206992},
  abstract  = {To address the increasing susceptibility of commodity chip multiprocessors (CMPs) to transient faults, we propose Chip-level Redundantly Threaded multiprocessor with Recovery (CRTR). CRTR extends the previously-proposed CRT for transient-fault detection in CMPs, and the previously-proposed SRTR for transient-fault recovery in SMT. All these schemes achieve fault tolerance by executing and comparing two copies, called leading and trailing threads, of a given application. Previous recovery schemes for SMT do not perform well on CMPs. In a CMP, the leading and trailing threads execute on different processors to achieve load balancing and reduce the probability of a fault corrupting both threads; whereas in an SMT, both threads execute on the same processor. The interprocessor communication required to compare the threads introduces latency and bandwidth problems not present in an SMT. To hide interprocessor latency, CRTR executes the leading thread ahead of the trailing thread by maintaining a long slack, enabled by asymmetric commit. CRTR commits the leading thread before checking and the trailing thread after checking, so that the trailing thread state may be used for recovery. Previous recovery schemes commit both threads after checking, making a long slack suboptimal. To tackle interprocessor bandwidth, CRTR not only increases the bandwidth supply by pipelining the communication paths, but also reduces the bandwidth demand. By reasoning that faults propagate through dependences, previously-proposed dependence based checking elision (DBCE) exploits (true) register dependence chains so that only the value of the last instruction in a chain is checked. However, instructions that mask operand bits may mask faults and limit the use of dependence chains. We propose death-and dependence-based checking elision (DDBCE), which chains a masking instruction only if the source operand of the instruction dies after the instruction. Register deaths ensure that masked faults do not corrupt later computation. Using SPEC2000, we show that CRTR incurs negligible performance loss compared to CRT for interprocessor (one-way) latency as high as 30 cycles, and that the bandwidth requirements of CRT and CRTR with DDBCE are 5.2 and 7.1 bytes/cycle, respectively.},
  booktitle = {30th {Annual} {International} {Symposium} on {Computer} {Architecture}, 2003. {Proceedings}.},
  author    = {Gomaa, Mohamed and Scarbrough, Chad and Vijaykumar, T. N. and Pomeranz, Irith},
  month     = jun,
  year      = {2003},
  note      = {ISSN: 1063-6897},
  keywords  = {Bandwidth, Cathode ray tubes, Delay, Fault tolerance, Load management, Performance loss, Pipeline processing, Registers, Surface-mount technology, Yarn},
  pages     = {98--109},
}
@inproceedings{mukherjee_soft_2005,
  title      = {The soft error problem: an architectural perspective},
  shorttitle = {The soft error problem},
  doi        = {10.1109/HPCA.2005.37},
  abstract   = {Radiation-induced soft errors have emerged as a key challenge in computer system design. If the industry is to continue to provide customers with the level of reliability they expect, microprocessor architects must address this challenge directly. This effort has two parts. First, architects must understand the impact of soft errors on their designs. Second, they must select judiciously from among available techniques to reduce this impact in order to meet their reliability targets with minimum overhead. To provide a foundation for these efforts, this paper gives a broad overview of the soft error problem from an architectural perspective. We start with basic definitions, followed by a description of techniques to compute the soft error rate. Then, we summarize techniques used to reduce the soft error rate. This paper also describes problems with double-bit errors. Finally, this paper outlines future directions for architecture research in soft errors.},
  booktitle  = {11th {International} {Symposium} on {High}-{Performance} {Computer} {Architecture}},
  author     = {Mukherjee, Shubhendu S. and Emer, Joel and Reinhardt, Steven K.},
  month      = feb,
  year       = {2005},
  note       = {ISSN: 2378-203X},
  keywords   = {Additives, Alpha particles, Computer architecture, Computer errors, Error analysis, Error correction, Particle measurements, Pollution measurement, Semiconductor device measurement, Testing},
  pages      = {243--247},
}
@inproceedings{karlin_decoy_2011,
  title     = {Decoy {Routing}: {Toward} {Unblockable} {Internet} {Communication}},
  abstract  = {We present decoy routing, a mechanism capable of circumventing common network filtering strategies. Unlike other circumvention techniques, decoy routing does not require a client to connect to a specific IP address (which is easily blocked) in order to provide circumvention. We show that if it is possible for a client to connect to any unblocked host/service, then decoy routing could be used to connect them to a blocked destination without cooperation from the host. This is accomplished by placing the circumvention service in the network itself where a single device could proxy traffic between a significant fraction of hosts instead of at the edge.},
  booktitle = {{USENIX} Workshop on Free and Open Communications on the Internet ({FOCI} 11)},
  publisher = {USENIX Association},
  language  = {en},
  author    = {Karlin, Josh and Ellard, Daniel and Jackson, Alden W and Jones, Christine E and Lauer, Greg and Mankins, David P and Strayer, W. Timothy},
  year      = {2011},
}
@inproceedings{wustrow_telex_2011,
  title     = {Telex: {Anticensorship} in the {Network} {Infrastructure}},
  abstract  = {In this paper, we present Telex, a new approach to resisting state-level Internet censorship. Rather than attempting to win the cat-and-mouse game of finding open proxies, we leverage censors unwillingness to completely block day-to-day Internet access. In effect, Telex converts innocuous, unblocked websites into proxies, without their explicit collaboration. We envision that friendly ISPs would deploy Telex stations on paths between censors networks and popular, uncensored Internet destinations. Telex stations would monitor seemingly innocuous flows for a special “tag” and transparently divert them to a forbidden website or service instead. We propose a new cryptographic scheme based on elliptic curves for tagging TLS handshakes such that the tag is visible to a Telex station but not to a censor. In addition, we use our tagging scheme to build a protocol that allows clients to connect to Telex stations while resisting both passive and active attacks. We also present a proof-of-concept implementation that demonstrates the feasibility of our system.},
  booktitle = {20th {USENIX} Security Symposium ({USENIX} Security 11)},
  publisher = {USENIX Association},
  language  = {en},
  author    = {Wustrow, Eric and Wolchok, Scott and Goldberg, Ian and Halderman, J. Alex},
  year      = {2011},
}
@inproceedings{houmansadr_cirripede_2011,
  title      = {Cirripede: circumvention infrastructure using router redirection with plausible deniability},
  shorttitle = {Cirripede},
  isbn       = {978-1-4503-0948-6},
  url        = {https://doi.org/10.1145/2046707.2046730},
  doi        = {10.1145/2046707.2046730},
  abstract   = {Many users face surveillance of their Internet communications and a significant fraction suffer from outright blocking of certain destinations. Anonymous communication systems allow users to conceal the destinations they communicate with, but do not hide the fact that the users are using them. The mere use of such systems may invite suspicion, or access to them may be blocked. We therefore propose Cirripede, a system that can be used for unobservable communication with Internet destinations. Cirripede is designed to be deployed by ISPs; it intercepts connections from clients to innocent-looking destinations and redirects them to the true destination requested by the client. The communication is encoded in a way that is indistinguishable from normal communications to anyone without the master secret key, while public-key cryptography is used to eliminate the need for any secret information that must be shared with Cirripede users. Cirripede is designed to work scalably with routers that handle large volumes of traffic while imposing minimal overhead on ISPs and not disrupting existing traffic. This allows Cirripede proxies to be strategically deployed at central locations, making access to Cirripede very difficult to block. We built a proof-of-concept implementation of Cirripede and performed a testbed evaluation of its performance properties.},
  booktitle  = {Proceedings of the 18th {ACM} conference on {Computer} and communications security},
  series     = {{CCS} '11},
  address    = {New York, NY, USA},
  publisher  = {Association for Computing Machinery},
  author     = {Houmansadr, Amir and Nguyen, Giang T.K. and Caesar, Matthew and Borisov, Nikita},
  month      = oct,
  year       = {2011},
  urldate    = {2021-10-25},
  keywords   = {censorship-resistance, unobservability},
  pages      = {187--200},
}
@inproceedings{wustrow_tapdance_2014,
  title      = {{TapDance}: {End}-to-{Middle} {Anticensorship} without {Flow} {Blocking}},
  shorttitle = {{TapDance}},
  isbn       = {978-1-931971-15-7},
  url        = {https://www.usenix.org/conference/usenixsecurity14/technical-sessions/presentation/wustrow},
  booktitle  = {23rd {USENIX} Security Symposium ({USENIX} Security 14)},
  publisher  = {USENIX Association},
  language   = {en},
  urldate    = {2021-10-24},
  author     = {Wustrow, Eric and Swanson, Colleen M. and Halderman, J. Alex},
  year       = {2014},
  pages      = {159--174},
}
@inproceedings{frolov_isp-scale_2017,
  title     = {An {ISP}-{Scale} {Deployment} of {TapDance}},
  url       = {https://www.usenix.org/conference/foci17/workshop-program/presentation/frolov},
  booktitle = {7th {USENIX} Workshop on Free and Open Communications on the Internet ({FOCI} 17)},
  publisher = {USENIX Association},
  language  = {en},
  urldate   = {2021-10-24},
  author    = {Frolov, Sergey and Douglas, Fred and Scott, Will and McDonald, Allison and VanderSloot, Benjamin and Hynes, Rod and Kruger, Adam and Kallitsis, Michalis and Robinson, David G. and Schultze, Steve and Borisov, Nikita and Halderman, J. Alex and Wustrow, Eric},
  year      = {2017},
  keywords  = {r02-essay-1},
}
@inproceedings{andersen_accountable_2008,
  title     = {Accountable {Internet} {Protocol} ({AIP})},
  isbn      = {978-1-60558-175-0},
  url       = {https://doi.org/10.1145/1402958.1402997},
  doi       = {10.1145/1402958.1402997},
  abstract  = {This paper presents AIP (Accountable Internet Protocol), a network architecture that provides accountability as a first-order property. AIP uses a hierarchy of self-certifying addresses, in which each component is derived from the public key of the corresponding entity. We discuss how AIP enables simple solutions to source spoofing, denial-of-service, route hijacking, and route forgery. We also discuss how AIP's design meets the challenges of scaling, key management, and traffic engineering.},
  booktitle = {Proceedings of the {ACM} {SIGCOMM} 2008 conference on {Data} communication},
  series    = {{SIGCOMM} '08},
  address   = {New York, NY, USA},
  publisher = {Association for Computing Machinery},
  author    = {Andersen, David G. and Balakrishnan, Hari and Feamster, Nick and Koponen, Teemu and Moon, Daekyeong and Shenker, Scott},
  month     = aug,
  year      = {2008},
  urldate   = {2021-10-24},
  keywords  = {accountability, address, internet architecture, r02-essay-1, scalability, security},
  pages     = {339--350},
}
@inproceedings{smolens_reunion_2006,
  title      = {Reunion: {Complexity}-{Effective} {Multicore} {Redundancy}},
  shorttitle = {Reunion},
  isbn       = {978-0-7695-2732-1},
  url        = {https://doi.org/10.1109/MICRO.2006.42},
  doi        = {10.1109/MICRO.2006.42},
  abstract   = {To protect processor logic from soft errors, multicore redundant architectures execute two copies of a program on separate cores of a chip multiprocessor (CMP). Maintaining identical instruction streams is challenging because redundant cores operate independently, yet must still receive the same inputs (e.g., load values and shared-memory invalidations). Past proposals strictly replicate load values across two cores, requiring significant changes to the highly-optimized core. We make the key observation that, in the common case, both cores load identical values without special hardware. When the cores do receive different load values (e.g., due to a data race), the same mechanisms employed for soft error detection and recovery can correct the difference. This observation permits designs that relax input replication, while still providing correct redundant execution. In this paper, we present Reunion, an execution model that provides relaxed input replication and preserves the existing memory interface, coherence protocols, and consistency models. We evaluate a CMP-based implementation of the Reunion execution model with full-system, cycle-accurate simulation. We show that the performance overhead of relaxed input replication is only 5\% and 6\% for commercial and scientific workloads, respectively.},
  booktitle  = {Proceedings of the 39th {Annual} {IEEE}/{ACM} {International} {Symposium} on {Microarchitecture}},
  series     = {{MICRO} 39},
  address    = {USA},
  publisher  = {IEEE Computer Society},
  author     = {Smolens, Jared C. and Gold, Brian T. and Falsafi, Babak and Hoe, James C.},
  month      = dec,
  year       = {2006},
  urldate    = {2021-10-22},
  keywords   = {r265-week-4},
  pages      = {223--234},
}
@inproceedings{gupta_stageweb_2010,
  title      = {{StageWeb}: {Interweaving} pipeline stages into a wearout and variation tolerant {CMP} fabric},
  shorttitle = {{StageWeb}},
  doi        = {10.1109/DSN.2010.5544915},
  abstract   = {Manufacture-time process variation and life-time failure projections have become a major industry concern. Consequently, fault tolerance, historically of interest only for mission-critical systems, is now gaining attention in the mainstream computing space. Traditionally reliability issues have been addressed at a coarse granularity, e.g., by disabling faulty cores in chip multiprocessors. However, this is not scalable to higher failure rates. In this paper, we propose StageWeb, a fine-grained wearout and variation tolerance solution, that employs a reconfigurable web of replicated processor pipeline stages to construct dependable many-core chips. The interconnection flexibility of StageWeb simultaneously tackles wearout failures (by isolating broken stages) and process variation (by selectively disabling slower stages). Our experiments show that through its wearout tolerance, a StageWeb chip performs up to 70\% more cumulative work than a comparable chip multiprocessor. Further, variation mitigation in StageWeb enables it to scale supply voltage more aggressively, resulting in up to 16\% energy savings.},
  booktitle  = {2010 {IEEE}/{IFIP} {International} {Conference} on {Dependable} {Systems} {Networks} ({DSN})},
  author     = {Gupta, Shantanu and Ansari, Amin and Feng, Shuguang and Mahlke, Scott},
  month      = jun,
  year       = {2010},
  note       = {ISSN: 2158-3927},
  keywords   = {Fabrics, Pipelines, architecture, multicore, permanent faults, process variation, r265-week-4, reliability},
  pages      = {101--110},
}
@inproceedings{lafrieda_utilizing_2007,
  title     = {Utilizing {Dynamically} {Coupled} {Cores} to {Form} a {Resilient} {Chip} {Multiprocessor}},
  doi       = {10.1109/DSN.2007.100},
  abstract  = {Aggressive CMOS scaling will make future chip multiprocessors (CMPs) increasingly susceptible to transient faults, hard errors, manufacturing defects, and process variations. Existing fault-tolerant CMP proposals that implement dual modular redundancy (DMR) do so by statically binding pairs of adjacent cores via dedicated communication channels and buffers. This can result in unnecessary power and performance losses in cases where one core is defective (in which case the entire DMR pair must be disabled), or when cores exhibit different frequency/leakage characteristics due to process variations (in which case the pair runs at the speed of the slowest core). Static DMR also hinders power density/thermal management, as DMR pairs running code with similar power/thermal characteristics are necessarily placed next to each other on the die. We present dynamic core coupling (DCC), an architectural technique that allows arbitrary CMP cores to verify each other's execution while requiring no static core binding at design time or dedicated communication hardware. Our evaluation shows that the performance overhead of DCC over a CMP without fault tolerance is 3\% on SPEC2000 benchmarks, and is within 5\% for a set of scalable parallel scientific and data mining applications with up to eight threads (16 processors). Our results also show that DCC has the potential to significantly outperform existing static DMR schemes.},
  booktitle = {37th {Annual} {IEEE}/{IFIP} {International} {Conference} on {Dependable} {Systems} and {Networks} ({DSN}'07)},
  author    = {LaFrieda, Christopher and Ipek, Engin and Mart{\'i}nez, Jos{\'e} F. and Manohar, Rajit},
  month     = jun,
  year      = {2007},
  note      = {ISSN: 2158-3927},
  keywords  = {CMOS process, Communication channels, Energy management, Fault tolerance, Frequency, Manufacturing processes, Performance loss, Proposals, Redundancy, Thermal management, r265-week-4},
  pages     = {317--326},
}
@article{dahlgren_evaluation_1996,
  title    = {Evaluation of hardware-based stride and sequential prefetching in shared-memory multiprocessors},
  volume   = {7},
  number   = {4},
  issn     = {1045-9219},
  url      = {http://ieeexplore.ieee.org/document/494633/},
  doi      = {10.1109/71.494633},
  abstract = {We study the efficiency of previously proposed stride and sequential prefetching—two promising hardware-based prefetching schemes to reduce readmiss penalties in shared-memory multiprocessors. Although stride accesses dominate in four out of six of the applications we study, we find that sequential prefetching does as well as and in same cases even better than stride prefetching for five applications. This is because (i) most strides are shorter than the block size (we assume 32 byte blocks), which means that sequential prefetching is as effective for these stride accesses, and (ii) sequential prefetching also exploits the locality of read misses with non-stride accesses. However, since stride prefetching in general results in fewer useless prefetches, it offers the extra advantage of consuming less memory-system bandwidth.},
  language = {en},
  urldate  = {2021-10-21},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  author   = {Dahlgren, Fredrik and Stenstr{\"o}m, Per},
  month    = apr,
  year     = {1996},
  pages    = {385--398},
}
@inproceedings{pugsley_sandbox_2014,
  title      = {Sandbox {Prefetching}: {Safe} run-time evaluation of aggressive prefetchers},
  shorttitle = {Sandbox {Prefetching}},
  doi        = {10.1109/HPCA.2014.6835971},
  abstract   = {Memory latency is a major factor in limiting CPU performance, and prefetching is a well-known method for hiding memory latency. Overly aggressive prefetching can waste scarce resources such as memory bandwidth and cache capacity, limiting or even hurting performance. It is therefore important to employ prefetching mechanisms that use these resources prudently, while still prefetching required data in a timely manner. In this work, we propose a new mechanism to determine at run-time the appropriate prefetching mechanism for the currently executing program, called Sandbox Prefetching. Sandbox Prefetching evaluates simple, aggressive offset prefetchers at run-time by adding the prefetch address to a Bloom filter, rather than actually fetching the data into the cache. Subsequent cache accesses are tested against the contents of the Bloom filter to see if the aggressive prefetcher under evaluation could have accurately prefetched the data, while simultaneously testing for the existence of prefetchable streams. Real prefetches are performed when the accuracy of evaluated prefetchers exceeds a threshold. This method combines the ideas of global pattern confirmation and immediate prefetching action to achieve high performance. Sandbox Prefetching improves performance across the tested workloads by 47.6\% compared to not using any prefetching, and by 18.7\% compared to the Feedback Directed Prefetching technique. Performance is also improved by 1.4\% compared to the Access Map Pattern Matching Prefetcher, while incurring considerably less logic and storage overheads.},
  booktitle  = {2014 {IEEE} 20th {International} {Symposium} on {High} {Performance} {Computer} {Architecture} ({HPCA})},
  author     = {Pugsley, Seth H and Chishti, Zeshan and Wilkerson, Chris and Chuang, Peng-fei and Scott, Robert L and Jaleel, Aamer and Lu, Shih-Lien and Chow, Kingsum and Balasubramonian, Rajeev},
  month      = feb,
  year       = {2014},
  note       = {ISSN: 2378-203X},
  keywords   = {Accuracy, Bandwidth, Monitoring, Pattern matching, Prefetching, Radiation detectors},
  pages      = {626--637},
}
@inproceedings{ebrahimi_techniques_2009,
  title     = {Techniques for bandwidth-efficient prefetching of linked data structures in hybrid prefetching systems},
  doi       = {10.1109/HPCA.2009.4798232},
  abstract  = {Linked data structure (LDS) accesses are critical to the performance of many large scale applications. Techniques have been proposed to prefetch such accesses. Unfortunately, many LDS prefetching techniques 1) generate a large number of useless prefetches, thereby degrading performance and bandwidth efficiency, 2) require significant hardware or storage cost, or 3) when employed together with stream-based prefetchers, cause significant resource contention in the memory system. As a result, existing processors do not employ LDS prefetchers even though they commonly employ stream-based prefetchers. This paper proposes a low-cost hardware/software cooperative technique that enables bandwidth-efficient prefetching of linked data structures. Our solution has two new components: 1) a compiler-guided prefetch filtering mechanism that informs the hardware about which pointer addresses to prefetch, 2) a coordinated prefetcher throttling mechanism that uses run-time feedback to manage the interference between multiple prefetchers (LDS and stream-based) in a hybrid prefetching system. Evaluations show that the proposed solution improves average performance by 22.5\% while decreasing memory bandwidth consumption by 25\% over a baseline system that employs an effective stream prefetcher on a set of memory- and pointer-intensive applications. We compare our proposal to three different LDS/correlation prefetching techniques and find that it provides significantly better performance on both single-core and multi-core systems, while requiring less hardware cost.},
  booktitle = {2009 {IEEE} 15th {International} {Symposium} on {High} {Performance} {Computer} {Architecture}},
  author    = {Ebrahimi, Eiman and Mutlu, Onur and Patt, Yale N.},
  month     = feb,
  year      = {2009},
  note      = {ISSN: 2378-203X},
  keywords  = {Bandwidth, Costs, Data structures, Degradation, Feedback, Filtering, Hardware, Large-scale systems, Prefetching, Runtime},
  pages     = {7--17},
}
@inproceedings{nesbit_data_2004,
  title     = {Data {Cache} {Prefetching} {Using} a {Global} {History} {Buffer}},
  doi       = {10.1109/HPCA.2004.10030},
  abstract  = {A new structure for implementing data cache prefetching is proposed and analyzed via simulation. The structure is based on a Global History Buffer that holds the most recent miss addresses in FIFO order. Linked lists within this global history buffer connect addresses that have some common property, e.g. they were all generated by the same load instruction. The Global History Buffer can be used for implementing a number of previously proposed prefetch methods, as well as new ones. Prefetching with the Global History Buffer has two significant advantages over conventional table prefetching methods. First, the use of a FIFO history buffer can improve the accuracy of correlation prefetching by eliminating stale data from the table. Second, the Global History Buffer contains a more complete (and intact) picture of cache miss history, creating opportunities to design more effective prefetching methods. Global History Buffer prefetching can increase correlation prefetching performance by 20\% and cut its memory traffic by 90\%. Furthermore, the Global History Buffer can make correlations within a loads address stream, which can increase stride prefetching performance by 6\%. Collectively, the Global History Buffer prefetching methods perform as well or better than the conventional prefetching methods studied on 14 of 15 benchmarks.},
  booktitle = {10th {International} {Symposium} on {High} {Performance} {Computer} {Architecture} ({HPCA}'04)},
  author    = {Nesbit, Kyle J. and Smith, James E.},
  month     = feb,
  year      = {2004},
  note      = {ISSN: 1530-0897},
  keywords  = {Analytical models, Cache memory, Clocks, Computational modeling, Computer simulation, Delay, History, Microarchitecture, Microprocessors, Prefetching},
  pages     = {96--96},
}
@inproceedings{wang_guided_2003,
title = {Guided region prefetching: a cooperative hardware/software approach},
shorttitle = {Guided region prefetching},
doi = {10.1109/ISCA.2003.1207016},
abstract = {Despite large caches, main-memory access latencies still cause significant performance losses in many applications. Numerous hardware and software prefetching schemes have been proposed to tolerate these latencies. Software prefetching typically provides better prefetch accuracy than hardware, but is limited by prefetch instruction overheads and the compiler's limited ability to schedule prefetches sufficiently far in advance to cover level-two cache miss latencies. Hardware prefetching can be effective at hiding these large latencies, but generates many useless prefetches and consumes considerable memory bandwidth. We propose a cooperative hardware-software prefetching scheme called guided region prefetching (GRP), which uses compiler-generated hints encoded in load instructions to regulate an aggressive hardware prefetching engine. We compare GRP against a sophisticated pure hardware stride prefetcher and a scheduled region prefetching (SRP) engine. SRP and GRP show the best performance, with respective 22\% and 21\% gains over no prefetching, but SRP incurs 180\% extra memory traffic-nearly tripling bandwidth requirements. GRP achieves performance close to SRP, but with a mere eighth of the extra prefetching traffic, a 23\% increase over no prefetching. The GRP hardware-software collaboration thus combines the accuracy of compiler-based program analysis with the performance potential of aggressive hardware prefetching, bringing the performance gap versus a perfect L2 cache under 20\%.},
booktitle = {30th {Annual} {International} {Symposium} on {Computer} {Architecture}, 2003. {Proceedings}.},
author = {Wang, Zhenlin and Burger, D. and McKinley, K.S. and Reinhardt, S.K. and Weems, C.C.},
month = jun,
year = {2003},
note = {ISSN: 1063-6897},
keywords = {Application software, Bandwidth, Collaboration, Delay, Engines, Hardware, Performance gain, Performance loss, Prefetching, Program processors},
pages = {388--398},
}
@inproceedings{michaud_best-offset_2016,
address = {Barcelona, Spain},
title = {Best-offset hardware prefetching},
isbn = {978-1-4673-9211-2},
url = {http://ieeexplore.ieee.org/document/7446087/},
doi = {10.1109/HPCA.2016.7446087},
abstract = {Hardware prefetching is an important feature of modern highperformance processors. When the application working set is too large to fit in on-chip caches, disabling hardware prefetchers may result in severe performance reduction. A new prefetcher was recently introduced, the Sandbox prefetcher, that tries to find dynamically the best prefetch offset using the sandbox method. The Sandbox prefetcher uses simple hardware and was shown to be quite effective. However, the sandbox method does not take into account prefetch timeliness. We propose an offset prefetcher with a new method for selecting the prefetch offset that takes into account prefetch timeliness. We show that our Best-Offset prefetcher outperforms the Sandbox prefetcher on the SPEC CPU2006 benchmarks, with equally simple hardware.},
language = {en},
urldate = {2021-10-19},
booktitle = {2016 {IEEE} {International} {Symposium} on {High} {Performance} {Computer} {Architecture} ({HPCA})},
publisher = {IEEE},
author = {Michaud, Pierre},
month = mar,
year = {2016},
keywords = {r265-week-3},
pages = {469--480},
}
@inproceedings{lu_end--end_2019,
address = {Amsterdam Netherlands},
title = {An {End}-to-{End}, {Large}-{Scale} {Measurement} of {DNS}-over-{Encryption}: {How} {Far} {Have} {We} {Come}?},
isbn = {978-1-4503-6948-0},
shorttitle = {An {End}-to-{End}, {Large}-{Scale} {Measurement} of {DNS}-over-{Encryption}},
url = {https://dl.acm.org/doi/10.1145/3355369.3355580},
doi = {10.1145/3355369.3355580},
abstract = {DNS packets are designed to travel in unencrypted form through the Internet based on its initial standard. Recent discoveries show that real-world adversaries are actively exploiting this design vulnerability to compromise Internet users' security and privacy. To mitigate such threats, several protocols have been proposed to encrypt DNS queries between DNS clients and servers, which we jointly term as DNS-over-Encryption. While some proposals have been standardized and are gaining strong support from the industry, little has been done to understand their status from the view of global users.},
language = {en},
urldate = {2021-10-19},
booktitle = {Proceedings of the {Internet} {Measurement} {Conference}},
publisher = {ACM},
author = {Lu, Chaoyi and Liu, Baojun and Li, Zhou and Hao, Shuang and Duan, Haixin and Zhang, Mingming and Leng, Chunying and Liu, Ying and Zhang, Zaifeng and Wu, Jianping},
month = oct,
year = {2019},
pages = {22--35},
}
@misc{noauthor_https_nodate,
title = {{HTTPS} encryption on the web {Google} {Transparency} {Report}},
url = {https://transparencyreport.google.com/https/overview?hl=en_GB},
urldate = {2021-10-18},
}
@misc{noauthor_cloudflare_2021,
title = {Cloudflare {Images} {Now} {Available} to {Everyone}},
url = {https://blog.cloudflare.com/announcing-cloudflare-images/},
abstract = {Today, we are launching Cloudflare Images for all customers. Images provides a single product to store, resize and serve images. We built Cloudflare Images, so customers of all sizes can build a scalable and affordable image pipeline with a fraction of the effort.},
language = {en},
urldate = {2021-10-18},
journal = {The Cloudflare Blog},
month = sep,
year = {2021},
}
@misc{noauthor_increase_2021,
title = {Increase speeds by over 400\% with {VPN} {Accelerator}},
url = {https://protonvpn.com/blog/vpn-accelerator/},
abstract = {VPN Accelerator uses a combination of advanced VPN technologies to improve connection stability and increase your connection speed},
language = {en-US},
urldate = {2021-10-18},
journal = {ProtonVPN Blog},
month = jul,
year = {2021},
}
@article{klaas_wierenga_eduroam_nodate,
title = {Eduroam: past, present and future},
shorttitle = {Eduroam},
url = {http://www.cmst.eu/articles/eduroam-past-present-and-future},
doi = {10.12921/CMST.2005.11.02.169-173},
abstract = {The number of mobile devices within academia has increased significantly over the last couple of years and users expect to be able to get connectivity everywhere, at home, on the road and at educational institutions. At the same time however, the security of wireless LANs becomes more and more of a concern. In 2003, the TERENA Task Force on Mobility [1] was created to look at WLAN security issues and to formulate requirements to design an international roaming solution that would provide National Research and Educational Networks (NRENs) users with secure Internet access at academic campuses across Europe. The solution proposed was tested and proved to be very successful with more and more institutions joining it. This infrastructure is called eduroam, which stands for Education Roaming. Within the 6th framework project GÉANT2 [2], the aim is to expand the existing infrastructure into a pan-European full service for Roaming and Authentication/Authorisation.},
language = {en},
urldate = {2021-10-18},
author = {Wierenga, Klaas and Florio, Licia},
journal = {Computational Methods in Science and Technology},
volume = {11},
number = {2},
year = {2005},
pages = {169--173},
note = {Publisher: ICHB PAS Poznan Supercomputing and Networking Center},
}
@inproceedings{vallina-rodriguez_beyond_2015,
address = {New York, NY, USA},
series = {{MobiSys} '15},
title = {Beyond the {Radio}: {Illuminating} the {Higher} {Layers} of {Mobile} {Networks}},
isbn = {978-1-4503-3494-5},
shorttitle = {Beyond the {Radio}},
url = {https://doi.org/10.1145/2742647.2742675},
doi = {10.1145/2742647.2742675},
abstract = {Cellular network performance is often viewed as primarily dominated by the radio technology. However, reality proves more complex: mobile operators deploy and configure their networks in different ways, and sometimes establish network sharing agreements with other mobile carriers. Moreover, regulators have encouraged newer operational models such as Mobile Virtual Network Operators (MVNOs) to promote competition. In this paper we draw upon data collected by the ICSI Netalyzr app for Android to characterize how operational decisions, such as network configurations, business models, and relationships between operators introduce diversity in service quality and affect user security and privacy. We delve in detail beyond the radio link and into network configuration and business relationships in six countries. We identify the widespread use of transparent middleboxes such as HTTP and DNS proxies, analyzing how they actively modify user traffic, compromise user privacy, and potentially undermine user security. In addition, we identify network sharing agreements between operators, highlighting the implications of roaming and characterizing the properties of MVNOs, including that a majority are simply rebranded versions of major operators. More broadly, our findings highlight the importance of considering higher-layer relationships when seeking to analyze mobile traffic in a sound fashion.},
urldate = {2021-10-16},
booktitle = {Proceedings of the 13th {Annual} {International} {Conference} on {Mobile} {Systems}, {Applications}, and {Services}},
publisher = {Association for Computing Machinery},
author = {Vallina-Rodriguez, Narseo and Sundaresan, Srikanth and Kreibich, Christian and Weaver, Nicholas and Paxson, Vern},
month = may,
year = {2015},
keywords = {android, cellular networks, dns, http header injection, http proxy, measurement, middleboxes, mobile networks, mobile traffic, pep, privacy, r02-week-3},
pages = {375--387},
}
@article{crowcroft_plutarch_nodate,
title = {Plutarch: {An} {Argument} for {Network} {Pluralism}},
abstract = {It is widely accepted that the current Internet architecture is insufficient for the future: problems such as address space scarcity, mobility and non-universal connectivity are already with us, and stand to be exacerbated by the explosion of wireless, ad-hoc and sensor networks. Furthermore, it is far from clear that the ubiquitous use of standard transport and name resolution protocols will remain practicable or even desirable.},
language = {en},
author = {Crowcroft, Jon and Hand, Steven and Mortier, Richard},
keywords = {r02-week-2},
pages = {10},
}
@incollection{hutchison_haggle_2007,
address = {Berlin, Heidelberg},
title = {Haggle: {Seamless} {Networking} for {Mobile} {Applications}},
volume = {4717},
isbn = {978-3-540-74852-6 978-3-540-74853-3},
shorttitle = {Haggle},
url = {http://link.springer.com/10.1007/978-3-540-74853-3_23},
abstract = {This paper presents Haggle, an architecture for mobile devices that enables seamless network connectivity and application functionality in dynamic mobile environments. Current applications must contain significant network binding and protocol logic, which makes them inflexible to the dynamic networking environments facing mobile devices. Haggle allows separating application logic from transport bindings so that applications can be communication agnostic. Internally, the Haggle framework provides a mechanism for late-binding interfaces, names, protocols, and resources for network communication. This separation allows applications to easily utilize multiple communication modes and methods across infrastructure and infrastructure-less environments. We provide a prototype implementation of the Haggle framework and evaluate it by demonstrating support for two existing legacy applications, email and web browsing. Haggle makes it possible for these applications to seamlessly utilize mobile networking opportunities both with and without infrastructure.},
language = {en},
urldate = {2021-10-12},
booktitle = {{UbiComp} 2007: {Ubiquitous} {Computing}},
publisher = {Springer Berlin Heidelberg},
author = {Su, Jing and Scott, James and Hui, Pan and Crowcroft, Jon and de Lara, Eyal and Diot, Christophe and Goel, Ashvin and Lim, Meng How and Upton, Eben},
editor = {Hutchison, David and Kanade, Takeo and Kittler, Josef and Kleinberg, Jon M. and Mattern, Friedemann and Mitchell, John C. and Naor, Moni and Nierstrasz, Oscar and Pandu Rangan, C. and Steffen, Bernhard and Sudan, Madhu and Terzopoulos, Demetri and Tygar, Doug and Vardi, Moshe Y. and Weikum, Gerhard and Krumm, John and Abowd, Gregory D. and Seneviratne, Aruna and Strang, Thomas},
year = {2007},
doi = {10.1007/978-3-540-74853-3_23},
note = {Series Title: Lecture Notes in Computer Science},
keywords = {r02-week-2},
pages = {391--408},
}
@article{zhang_minnow_2018,
title = {Minnow: {Lightweight} {Offload} {Engines} for {Worklist} {Management} and {Worklist}-{Directed} {Prefetching}},
volume = {53},
issn = {0362-1340},
shorttitle = {Minnow},
url = {https://doi.org/10.1145/3296957.3173197},
doi = {10.1145/3296957.3173197},
abstract = {The importance of irregular applications such as graph analytics is rapidly growing with the rise of Big Data. However, parallel graph workloads tend to perform poorly on general-purpose chip multiprocessors (CMPs) due to poor cache locality, low compute intensity, frequent synchronization, uneven task sizes, and dynamic task generation. At high thread counts, execution time is dominated by worklist synchronization overhead and cache misses. Researchers have proposed hardware worklist accelerators to address scheduling costs, but these proposals often harden a specific scheduling policy and do not address high cache miss rates. We address this with Minnow, a technique that augments each core in a CMP with a lightweight Minnow accelerator. Minnow engines offload worklist scheduling from worker threads to improve scalability. The engines also perform worklist-directed prefetching, a technique that exploits knowledge of upcoming tasks to issue nearly perfectly accurate and timely prefetch operations. On a simulated 64-core CMP running a parallel graph benchmark suite, Minnow improves scalability and reduces L2 cache misses from 29 to 1.2 MPKI on average, resulting in 6.01x average speedup over an optimized software baseline for only 1\% area overhead.},
number = {2},
urldate = {2021-10-16},
journal = {ACM SIGPLAN Notices},
author = {Zhang, Dan and Ma, Xiaoyu and Thomson, Michael and Chiou, Derek},
month = mar,
year = {2018},
keywords = {accelerators, graph analytics, helper threads, parallel architectures, prefetching, r265-week-3, scheduling},
pages = {593--607},
}
@inproceedings{jain_linearizing_2013,
address = {New York, NY, USA},
series = {{MICRO}-46},
title = {Linearizing irregular memory accesses for improved correlated prefetching},
isbn = {978-1-4503-2638-4},
url = {https://doi.org/10.1145/2540708.2540730},
doi = {10.1145/2540708.2540730},
abstract = {This paper introduces the Irregular Stream Buffer (ISB), a prefetcher that targets irregular sequences of temporally correlated memory references. The key idea is to use an extra level of indirection to translate arbitrary pairs of correlated physical addresses into consecutive addresses in a new structural address space, which is visible only to the ISB. This structural address space allows the ISB to organize prefetching meta-data so that it is simultaneously temporally and spatially ordered, which produces technical benefits in terms of coverage, accuracy, and memory traffic overhead. We evaluate the ISB using the Marss full system simulator and the irregular memory-intensive programs of SPEC CPU 2006 for both single-core and multi-core systems. For example, on a single core, the ISB exhibits an average speedup of 23.1\% with 93.7\% accuracy, compared to 9.9\% speedup and 64.2\% accuracy for an idealized prefetcher that over-approximates the STMS prefetcher, the previous best temporal stream prefetcher; this ISB prefetcher uses 32 KB of on-chip storage and sees 8.4\% memory traffic overhead due to meta-data accesses. We also show that a hybrid prefetcher that combines a stride-prefetcher and an ISB with just 8 KB of on-chip storage exhibits 40.8\% speedup and 66.2\% accuracy.},
urldate = {2021-10-16},
booktitle = {Proceedings of the 46th {Annual} {IEEE}/{ACM} {International} {Symposium} on {Microarchitecture}},
publisher = {Association for Computing Machinery},
author = {Jain, Akanksha and Lin, Calvin},
month = dec,
year = {2013},
keywords = {prefetching, r265-week-3},
pages = {247--259},
}
@article{celio_boom_nodate,
title = {{BOOM} v2: an open-source out-of-order {RISC}-{V} core},
abstract = {This paper presents BOOM version 2, an updated version of the Berkeley Out-of-Order Machine first presented in [3]. The design exploration was performed through synthesis, place and route using the foundry-provided standard-cell library and the memory compiler in the TSMC 28 nm HPM process (high performance mobile).},
language = {en},
author = {Celio, Christopher and Chiu, Pi-Feng and Nikolic, Borivoje and Patterson, David and Asanovic, Krste},
pages = {8},
}
@article{celio_berkeley_nodate,
title = {The {Berkeley} {Out}-of-{Order} {Machine} ({BOOM}): {An} {Industry}-{Competitive}, {Synthesizable}, {Parameterized} {RISC}-{V} {Processor}},
language = {en},
author = {Celio, Christopher and Patterson, David and Asanovic, Krste},
pages = {5},
}
@article{chiu_cache_2018,
title = {Cache {Resiliency} {Techniques} for a {Low}-{Voltage} {RISC}-{V} {Out}-of-{Order} {Processor} in 28-nm {CMOS}},
volume = {1},
issn = {2573-9603},
doi = {10.1109/LSSC.2019.2900148},
abstract = {Architecture-level assist techniques enable low-voltage operation by tolerating errors in SRAM-based caches. A line recycling (LR) technique is proposed to reuse faulty cache lines that fail at low voltages to correct errors with only 0.77\% level-2 (L2) area overhead. LR can either save 33\% of cache capacity loss from line disable or allow further reduction in minimum operating voltage (Vmin). Bit bypass implemented with SRAM extends the tag array to log error entries providing multibit-error protection for the metadata with minimal area overhead. An open-source out-of-order superscalar processor that implements the 64-bit RISC-V instruction set architecture is built to validate the proposed cache resiliency techniques. The 2.7 mm × 1.8 mm chip includes one core operating at 1.0 GHz at nominal 0.9 V with 1 MB of L2 cache in a 28-nm HPM process. LR reduces Vmin to 0.47 V, improving energy efficiency by 43\% with negligible impact on CPI.},
number = {12},
journal = {IEEE Solid-State Circuits Letters},
author = {Chiu, Pi-Feng and Celio, Christopher and Asanović, Krste and Nikolić, Borivoje and Patterson, David},
month = dec,
year = {2018},
note = {Conference Name: IEEE Solid-State Circuits Letters},
keywords = {Cache resiliency, Circuit faults, Low voltage, Maintenance engineering, Metadata, RISC-V, Random access memory, Recycling, Resilience, open-source processor, out-of-order processor},
pages = {229--232},
}
@article{davidson_celerity_2018,
title = {The {Celerity} {Open}-{Source} 511-{Core} {RISC}-{V} {Tiered} {Accelerator} {Fabric}: {Fast} {Architectures} and {Design} {Methodologies} for {Fast} {Chips}},
volume = {38},
issn = {1937-4143},
shorttitle = {The {Celerity} {Open}-{Source} 511-{Core} {RISC}-{V} {Tiered} {Accelerator} {Fabric}},
doi = {10.1109/MM.2018.022071133},
abstract = {Rapidly emerging workloads require rapidly developed chips. The Celerity 16-nm open-source SoC was implemented in nine months using an architectural trifecta to minimize development time: a general-purpose tier comprised of open-source Linux-capable RISC-V cores, a massively parallel tier comprised of a RISC-V tiled manycore array that can be scaled to arbitrary sizes, and a specialization tier that uses high-level synthesis (HLS) to create an algorithmic neural-network accelerator. These tiers are tied together with an efficient heterogeneous remote store programming model on top of a flexible partial global address space memory system.},
number = {2},
journal = {IEEE Micro},
author = {Davidson, Scott and Xie, Shaolin and Torng, Christopher and Al-Hawai, Khalid and Rovinski, Austin and Ajayi, Tutu and Vega, Luis and Zhao, Chun and Zhao, Ritchie and Dai, Steve and Amarnath, Aporva and Veluri, Bandhav and Gao, Paul and Rao, Anuj and Liu, Gai and Gupta, Rajesh K. and Zhang, Zhiru and Dreslinski, Ronald and Batten, Christopher and Taylor, Michael Bedford},
month = mar,
year = {2018},
note = {Conference Name: IEEE Micro},
keywords = {Energy efficiency, Memory management, Open source software, Programming, Reduced instruction set computing, System-on-chip, hardware, microchips, r265-week-2},
pages = {30--41},
}
@inproceedings{fuchs_accelerator_2019,
address = {Washington, DC, USA},
title = {The {Accelerator} {Wall}: {Limits} of {Chip} {Specialization}},
isbn = {978-1-72811-444-6},
shorttitle = {The {Accelerator} {Wall}},
url = {https://ieeexplore.ieee.org/document/8675237/},
doi = {10.1109/HPCA.2019.00023},
abstract = {Specializing chips using hardware accelerators has become the prime means to alleviate the gap between the growing computational demands and the stagnating transistor budgets caused by the slowdown of CMOS scaling. Much of the benefits of chip specialization stems from optimizing a computational problem within a given chip's transistor budget. Unfortunately, the stagnation of the number of transistors available on a chip will limit the accelerator design optimization space, leading to diminishing specialization returns, ultimately hitting an accelerator wall.},
language = {en},
urldate = {2021-10-12},
booktitle = {2019 {IEEE} {International} {Symposium} on {High} {Performance} {Computer} {Architecture} ({HPCA})},
publisher = {IEEE},
author = {Fuchs, Adi and Wentzlaff, David},
month = feb,
year = {2019},
keywords = {r265-week-1},
pages = {1--14},
}
@article{doweck_inside_2017,
title = {Inside 6th-{Generation} {Intel} {Core}: {New} {Microarchitecture} {Code}-{Named} {Skylake}},
volume = {37},
issn = {1937-4143},
shorttitle = {Inside 6th-{Generation} {Intel} {Core}},
doi = {10.1109/MM.2017.38},
abstract = {Skylake's core, processor graphics, and system on chip were designed to meet a demanding set of requirements for a wide range of power-performance points. Its coherent fabric was designed to provide high-memory bandwidth from multiple memory sources. Skylake's power management, which includes Intel Speed Shift technology, was designed to provide the largest dynamic power range among prior Intel processors. The Intel Architecture core delivers higher power efficiency, higher frequency, and a wider dynamic power range, supporting smaller form factors. Skylake's Gen9 graphics provides new features designed to maximize energy efficiency and bring the best visual experience for gaming and media. Skylake offers a rich performance monitoring unit that enhances software developers' ability to optimize their applications.},
number = {2},
journal = {IEEE Micro},
author = {Doweck, Jack and Kao, Wen-Fu and Lu, Allen Kuan-yu and Mandelblat, Julius and Rahatekar, Anirudha and Rappoport, Lihu and Rotem, Efraim and Yasin, Ahmad and Yoaz, Adi},
month = mar,
year = {2017},
note = {Conference Name: IEEE Micro},
keywords = {Bandwidth, Central Processing Unit, Dynamic range, GPU, Graphics, Graphics processing units, Intel Speed Shift, Microarchitecture, Performance evaluation, Ports (Computers), Skylake, Turbo, eDRAM, microarchitecture, performance measurements, performance monitoring, power management, r265-week-2},
pages = {52--62},
}
@article{esmaeilzadeh_dark_2012,
title = {Dark {Silicon} and the {End} of {Multicore} {Scaling}},
volume = {32},
issn = {1937-4143},
doi = {10.1109/MM.2012.17},
abstract = {A key question for the microprocessor research and design community is whether scaling multicores will provide the performance and value needed to scale down many more technology generations. To provide a quantitative answer to this question, a comprehensive study that projects the speedup potential of future multicores and examines the underutilization of integration capacity-dark silicon-is timely and crucial.},
number = {3},
journal = {IEEE Micro},
author = {Esmaeilzadeh, Hadi and Blem, Emily and St. Amant, Renee and Sankaralingam, Karthikeyan and Burger, Doug},
month = may,
year = {2012},
note = {Conference Name: IEEE Micro},
keywords = {Benchmark testing, Microarchitecture, Moore's law, Multicore processing, Network topology, Performance evaluation, Silicon, Transistors, dark silicon, modeling, multicore, power, r265-week-1, technology scaling},
pages = {122--134},
}
@article{agarwal_clock_nodate,
title = {Clock {Rate} versus {IPC}: {The} {End} of the {Road} for {Conventional} {Microarchitectures}},
abstract = {The doubling of microprocessor performance every three years has been the result of two factors: more transistors per chip and superlinear scaling of the processor clock with technology generation. Our results show that, due to both diminishing improvements in clock rates and poor wire scaling as semiconductor devices shrink, the achievable performance growth of conventional microarchitectures will slow substantially. In this paper, we describe technology-driven models for wire capacitance, wire delay, and microarchitectural component delay. Using the results of these models, we measure the simulated performance—estimating both clock rate and IPC—of an aggressive out-of-order microarchitecture as it is scaled from a 250nm technology to a 35nm technology. We perform this analysis for three clock scaling targets and two microarchitecture scaling strategies: pipeline scaling and capacity scaling. We find that no scaling strategy permits annual performance improvements of better than 12.5\%, which is far worse than the annual 50-60\% to which we have grown accustomed.},
language = {en},
author = {Agarwal, Vikas and Hrishikesh, M S and Keckler, Stephen W and Burger, Doug},
keywords = {r265-week-1},
pages = {12},
}
@article{celio_broom_2019,
title = {{BROOM}: {An} {Open}-{Source} {Out}-of-{Order} {Processor} {With} {Resilient} {Low}-{Voltage} {Operation} in 28-nm {CMOS}},
volume = {39},
issn = {1937-4143},
shorttitle = {{BROOM}},
doi = {10.1109/MM.2019.2897782},
abstract = {The Berkeley resilient out-of-order machine (BROOM) is a resilient, wide-voltage-range implementation of an open-source out-of-order (OoO) RISC-V processor implemented in an ASIC flow. A 28-nm test-chip contains a BOOM OoO core and a 1-MiB level-2 (L2) cache, enhanced with architectural error tolerance for low-voltage operation. It was implemented by using an agile design methodology, where the initial OoO architecture was transformed to perform well in a high-performance, low-leakage CMOS process, informed by synthesis, place, and route data by using foundry-provided standard-cell library and memory compiler. The two-person-team productivity was improved in part thanks to a number of open-source artifacts: The Chisel hardware construction language, the RISC-V instruction set architecture, the rocket-chip SoC generator, and the open-source BOOM core. The resulting chip, taped out using TSMC's 28-nm HPM process, runs at 1.0 GHz at 0.9 V, and is able to operate down to 0.47 V.},
number = {2},
journal = {IEEE Micro},
author = {Celio, Christopher and Chiu, Pi-Feng and Asanović, Krste and Nikolić, Borivoje and Patterson, David},
month = mar,
year = {2019},
note = {Conference Name: IEEE Micro},
keywords = {Agile software development, CMOS process, Design methodology, Generators, Open source software, Random access memory, Voltage control, r265-week-2},
pages = {52--60},
}
@misc{henderson_newreno_2012,
title = {The {NewReno} {Modification} to {TCP}'s {Fast} {Recovery} {Algorithm}},
url = {https://tools.ietf.org/html/rfc6582},
urldate = {2021-05-13},
author = {Henderson, Thomas and Floyd, Sally and Gurtov, Andrei and Nishida, Yoshifumi},
month = apr,
year = {2012},
note = {RFC 6582},
}
@misc{cloudflare_cloudflare_nodate,
title = {Cloudflare - {The} {Web} {Performance} \& {Security} {Company}},
url = {https://www.cloudflare.com/},
abstract = {Here at Cloudflare, we make the Internet work the way it should. Offering CDN, DNS, DDoS protection and security, find out how we can help your site.},
language = {en-us},
urldate = {2021-05-12},
journal = {Cloudflare},
author = {{Cloudflare, Inc.}},
}
@article{donenfeld_formal_nodate,
title = {Formal {Verification} of the {WireGuard} {Protocol}},
abstract = {WireGuard, the secure network tunnel, uses an interesting DiffieHellman authenticated key exchange protocol based on NoiseIK, custom tailored to suit its unique operational requirements. This paper enumerates the security properties of this key exchange and then explores the formal verification of such properties. The end result is a formally verified secure network tunnel protocol.},
language = {en},
author = {Donenfeld, Jason A and Milner, Kevin},
pages = {11},
}
@misc{donenfeld_wireguard_2020,
title = {wireguard fixes for 5.6-rc7},
url = {https://lore.kernel.org/netdev/20200319003047.113501-1-Jason@zx2c4.com/},
urldate = {2021-05-12},
author = {Donenfeld, Jason A},
month = mar,
year = {2020},
}
@incollection{preneel_cryptographic_2018,
address = {Cham},
title = {A {Cryptographic} {Analysis} of the {WireGuard} {Protocol}},
volume = {10892},
isbn = {978-3-319-93386-3 978-3-319-93387-0},
url = {http://link.springer.com/10.1007/978-3-319-93387-0_1},
abstract = {WireGuard (Donenfeld, NDSS 2017) is a recently proposed secure network tunnel operating at layer 3. WireGuard aims to replace existing tunnelling solutions like IPsec and OpenVPN, while requiring less code, being more secure, more performant, and easier to use. The cryptographic design of WireGuard is based on the Noise framework. It makes use of a key exchange component which combines long-term and ephemeral Diffie-Hellman values (along with optional preshared keys). This is followed by the use of the established keys in an AEAD construction to encapsulate IP packets in UDP. To date, WireGuard has received no rigorous security analysis. In this paper, we rectify this. We first observe that, in order to prevent Key Compromise Impersonation (KCI) attacks, any analysis of WireGuard's key exchange component must take into account the first AEAD ciphertext from initiator to responder. This message effectively acts as a key confirmation and makes the key exchange component of WireGuard a 1.5 RTT protocol. However, the fact that this ciphertext is computed using the established session key rules out a proof of session key indistinguishability for WireGuard's key exchange component, limiting the degree of modularity that is achievable when analysing the protocol's security. To overcome this proof barrier, and as an alternative to performing a monolithic analysis of the entire WireGuard protocol, we add an extra message to the protocol. This is done in a minimally invasive way that does not increase the number of round trips needed by the overall WireGuard protocol. This change enables us to prove strong authentication and key indistinguishability properties for the key exchange component of WireGuard under standard cryptographic assumptions.},
language = {en},
urldate = {2021-05-12},
booktitle = {Applied {Cryptography} and {Network} {Security}},
publisher = {Springer International Publishing},
author = {Dowling, Benjamin and Paterson, Kenneth G.},
editor = {Preneel, Bart and Vercauteren, Frederik},
year = {2018},
doi = {10.1007/978-3-319-93387-0_1},
note = {Series Title: Lecture Notes in Computer Science},
pages = {3--21},
}
@misc{torvalds_linux_2020,
title = {Linux 5.6 - {Linus} {Torvalds}},
url = {https://lore.kernel.org/lkml/CAHk-=wi9ZT7Stg-uSpX0UWQzam6OP9Jzz6Xu1CkYu1cicpD5OA@mail.gmail.com/},
urldate = {2021-05-12},
author = {Torvalds, Linus},
month = mar,
year = {2020},
}
@misc{pennarun_how_2020,
title = {How {Tailscale} works},
url = {https://tailscale.com/blog/how-tailscale-works/},
abstract = {People often ask us for an overview of how Tailscale works. We've been},
language = {en},
urldate = {2021-05-12},
journal = {Tailscale},
author = {Pennarun, Avery},
month = mar,
year = {2020},
}
@misc{kinnear_boost_2020,
  title    = {Boost performance and security with modern networking - {WWDC} 2020 - {Videos}},
  author   = {Kinnear, Eric},
  journal  = {Apple Developer},
  month    = jun,
  year     = {2020},
  url      = {https://developer.apple.com/videos/play/wwdc2020/10111/?time=644},
  urldate  = {2021-05-12},
  abstract = {Speed up your app and make it more nimble, private and secure with modern networking APIs. Learn about networking protocols like IPv6,...},
  language = {en},
}
@misc{govindan_enabling_2020,
  title   = {Enabling {QUIC} in tip-of-tree},
  author  = {Govindan, Dharani},
  month   = apr,
  year    = {2020},
  url     = {https://groups.google.com/a/chromium.org/g/net-dev/c/5M9Z5mtvg_Y/m/iw9co1VrBQAJ?pli=1},
  urldate = {2021-05-12},
}
@misc{damjanovic_quic_2021,
title = {{QUIC} and {HTTP}/3 {Support} now in {Firefox} {Nightly} and {Beta}},
url = {https://hacks.mozilla.org/2021/04/quic-and-http-3-support-now-in-firefox-nightly-and-beta},
abstract = {Support for QUIC and HTTP/3 is now enabled by default in Firefox Nightly and Firefox Beta. HTTP/3 will be available by the end of May.},
language = {en-US},
urldate = {2021-05-12},
journal = {Mozilla Hacks},
author = {Damjanovic, Dragana},
month = apr,
year = {2021},
}
% Journal article: 12-month measurement study of how packet loss and delay
% variation affect Internet audio (Computer Communications 29(10), 2006).
% NOTE(review): the abstract contains the non-ASCII character "μ" (PCM μ-law).
% Fine under biblatex/Biber (UTF-8), but classic 8-bit BibTeX may mangle it --
% confirm which toolchain consumes this file before relying on it.
@article{roychoudhuri_impact_2006,
series = {Monitoring and {Measurements} of {IP} {Networks}},
title = {On the impact of loss and delay variation on {Internet} packet audio transmission},
volume = {29},
issn = {0140-3664},
url = {https://www.sciencedirect.com/science/article/pii/S0140366406001381},
doi = {10.1016/j.comcom.2006.04.004},
abstract = {The quality of audio in IP telephony is significantly influenced by various factors, including type of encoder, delay, delay variation, rate and distribution of packet loss, and type of error concealment. Hence, the performance of IP telephony systems is highly dependent on understanding the contribution of these factors to audio quality, and their impact on adaptive transport mechanisms such as error and buffer control. We conducted a large-scale audio transmission experiment over the Internet in a 12-month-period in order to evaluate the effects and the correlation of such parameters on audio transmission over IP. We have noticed that the correlation of loss and delay is not linear, but stronger correlation is observed as the delay approaches certain thresholds. We have made a number of new observations on various delay thresholds that are significant for loss prediction for adaptive audio transmission over IP networks. We also have made new observations to assess the audio quality of PCM μ-law and G.728 codecs under different loss and delay conditions. The paper provides a number of recommendations for implementing efficient adaptive FEC mechanisms based on our measurement observations and analysis.},
language = {en},
number = {10},
urldate = {2021-05-09},
journal = {Computer Communications},
author = {Roychoudhuri, Lopamudra and Al-Shaer, Ehab and Brewster, Gregory B.},
month = jun,
year = {2006},
keywords = {IP telephony, Internet measurement, Monitoring, VoIP},
pages = {1578--1589},
}
% Conference paper: user study of latency/jitter impact on perceived QoE in
% Call of Duty: Modern Warfare 2 (HCI International 2013, LNCS).
% NOTE(review): the abstract contains Unicode curly quotes and apostrophes
% ("How much ... dissatisfied?", "gamer's"); requires a UTF-8-capable
% toolchain (biblatex/Biber) -- confirm before compiling with classic BibTeX.
@inproceedings{amin_assessing_2013,
address = {Berlin, Heidelberg},
series = {Lecture {Notes} in {Computer} {Science}},
title = {Assessing the {Impact} of {Latency} and {Jitter} on the {Perceived} {Quality} of {Call} of {Duty} {Modern} {Warfare} 2},
isbn = {978-3-642-39265-8},
doi = {10.1007/978-3-642-39265-8_11},
abstract = {Jane McGonigal stated in her 2010 TED Talk that humans spend 3 billion hours a week playing video games around the planet. Americans alone devote 183 million hours per week to gaming. With numbers like these, its no wonder why end user demands for bandwidth have increased exponentially and the potential for network congestion is always present. We conduct a user study that focuses on the question: “How much network impairment is acceptable before users are dissatisfied?” In particular, the main objective of our study is to measure a gamers perceived Quality of Experience (QoE) for a real-time first person shooter (FPS) online game Call of Duty Modern Warfare 2 in presence of varied levels of network congestion. We develop a Mean Opinion Score (MOS) metric to determine each gamers QoE. We investigate the following hypothesis: The gamers perceived QoE correlates to their skill level.},
language = {en},
booktitle = {Human-{Computer} {Interaction}. {Users} and {Contexts} of {Use}},
publisher = {Springer},
author = {Amin, Rahul and Jackson, France and Gilbert, Juan E. and Martin, Jim and Shaw, Terry},
editor = {Kurosu, Masaaki},
year = {2013},
keywords = {First Person Shooter Games, Network Impairment, Online Gaming, Quality of Experience},
pages = {97--106},
}
% Conference paper: simulation study of TCP-over-TCP tunnelling effects,
% including the "TCP meltdown" problem (SPIE vol. 6011, 2005).
% NOTE(review): the abstract hard-codes \textit{TCP meltdown}; most styles
% render abstracts verbatim so this is tolerated, but it will leak raw LaTeX
% into any non-LaTeX consumer of this database.
@inproceedings{honda_understanding_2005,
title = {Understanding {TCP} over {TCP}: effects of {TCP} tunneling on end-to-end throughput and latency},
volume = {6011},
shorttitle = {Understanding {TCP} over {TCP}},
url = {https://www.spiedigitallibrary.org/conference-proceedings-of-spie/6011/60110H/Understanding-TCP-over-TCP--effects-of-TCP-tunneling-on/10.1117/12.630496.short},
doi = {10.1117/12.630496},
abstract = {TCP tunnel is a technology that aggregates and transfers packets sent between end hosts as a single TCP connection. By using a TCP tunnel, the fairness among aggregated flows can be improved and several protocols can be transparently transmitted through a firewall. Currently, many applications such as SSH, VTun, and HTun use a TCP tunnel. However, since most applications running on end hosts generally use TCP, two TCP congestion controls (i.e., end-to-end TCP and tunnel TCP) operate simultaneously and interfere each other. Under certain conditions, it has been known that using a TCP tunnel severely degrades the end-to-end TCP performance. Namely, it has known that using a TCP tunnel drastically degrades the end-to-end TCP throughput for some time, which is called \textit{TCP meltdown} problem. On the contrary, under other conditions, it has been known that using a TCP tunnel significantly improves the end-to-end TCP performance. However, it is still an open issue --- how, when, and why is a TCP tunnel malicious for end-to-end TCP performance? In this paper, we therefore investigate effect of TCP tunnel on end-to-end TCP performance using simulation experiments. Specifically, we quantitatively reveal effects of several factors (e.g., the propagation delay, usage of SACK option, TCP socket buffer size, and sender buffer size of TCP tunnel) on performance of end-to-end TCP and tunnel TCP.},
urldate = {2021-04-30},
booktitle = {Performance, {Quality} of {Service}, and {Control} of {Next}-{Generation} {Communication} and {Sensor} {Networks} {III}},
publisher = {International Society for Optics and Photonics},
author = {Honda, Osamu and Ohsaki, Hiroyuki and Imase, Makoto and Ishizuka, Mika and Murayama, Junichi},
month = oct,
year = {2005},
pages = {60110H},
}
@misc{bishop_hypertext_2021,
  title    = {Hypertext {Transfer} {Protocol} {Version} 3 ({HTTP}/3)},
  author   = {Bishop, Mike},
  month    = feb,
  year     = {2021},
  url      = {https://tools.ietf.org/html/draft-ietf-quic-http-34},
  urldate  = {2021-04-30},
  language = {en},
}
@misc{schooler_sip_2002,
title = {{SIP}: {Session} {Initiation} {Protocol}},
shorttitle = {{SIP}},
url = {https://tools.ietf.org/html/rfc3261},
language = {en},
urldate = {2021-04-30},
author = {Rosenberg, Jonathan and Schulzrinne, Henning and Camarillo, Gonzalo and Johnston, Alan and Peterson, Jon and Sparks, Robert and Handley, Mark and Schooler, Eve},
month = jun,
year = {2002},
}
@misc{postel_user_1980,
title = {User {Datagram} {Protocol}},
url = {https://tools.ietf.org/html/rfc768},
language = {en},
urldate = {2021-03-01},
author = {Postel, Jon},
month = aug,
year = {1980},
}
@article{kohler_designing_nodate,
title = {Designing {DCCP}: {Congestion} {Control} {Without} {Reliability}},
volume = {36},
number = {4},
journal = {ACM SIGCOMM Computer Communication Review},
abstract = {DCCP, the Datagram Congestion Control Protocol, is a new transport protocol in the TCP/UDP family that provides a congestion-controlled flow of unreliable datagrams. Delay-sensitive applications, such as streaming media and telephony, prefer timeliness to reliability. These applications have historically used UDP and implemented their own congestion control mechanisms—a difficult task—or no congestion control at all. DCCP will make it easy to deploy these applications without risking congestion collapse. It aims to add to a UDP-like foundation the minimum mechanisms necessary to support congestion control, such as possibly-reliable transmission of acknowledgement information. This minimal design should make DCCP suitable as a building block for more advanced application semantics, such as selective reliability. We introduce and motivate the protocol and discuss some of its design principles. Those principles particularly shed light on the ways TCPs reliable byte-stream semantics influence its implementation of congestion control.},
language = {en},
author = {Kohler, Eddie and Handley, Mark and Floyd, Sally},
year = {2006},
pages = {27--38},
}
@misc{kent_ip_2005,
  title    = {{IP} {Authentication} {Header}},
  author   = {Kent, Stephen},
  month    = dec,
  year     = {2005},
  url      = {https://tools.ietf.org/html/rfc4302},
  urldate  = {2021-01-29},
  language = {en},
}
@article{dolev_security_1983,
title = {On the {Security} of {Public} {Key} {Protocols}},
volume = {29},
language = {en},
number = {2},
journal = {IEEE Transactions on Information Theory},
author = {Dolev, Danny and Yao, Andrew C.},
month = mar,
year = {1983},
pages = {198--208},
}
@misc{beck_manifesto_2001,
title = {Manifesto for {Agile} {Software} {Development}},
url = {http://agilemanifesto.org/},
urldate = {2021-01-29},
author = {Beck, Kent and Beedle, Mike and van Bennekum, Arie and Cockburn, Alistair and Cunningham, Ward and Fowler, Martin and Grenning, James and Highsmith, Jim and Hunt, Andrew and Jeffries, Ron and Kern, Jon and Marick, Brian and Martin, Robert C. and Mellor, Steve and Schwaber, Ken and Sutherland, Jeff and Thomas, Dave},
year = {2001},
}
@book{menezes_handbook_1997,
address = {Boca Raton},
series = {{CRC} {Press} series on discrete mathematics and its applications},
title = {Handbook of applied cryptography},
isbn = {978-0-8493-8523-0},
publisher = {CRC Press},
author = {Menezes, Alfred J. and van Oorschot, Paul C. and Vanstone, Scott A.},
year = {1997},
keywords = {Access control Handbooks, manuals, etc, Computers, Cryptography, Handbooks, manuals, etc},
}
@techreport{dworkin_recommendation_2005,
address = {Gaithersburg, MD},
title = {Recommendation for block cipher modes of operation: the {CMAC} mode for authentication},
shorttitle = {Recommendation for block cipher modes of operation},
url = {https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-38b.pdf},
abstract = {This Recommendation specifies a message authentication code (MAC) algorithm based on a symmetric key block cipher. This block cipher-based MAC algorithm, called CMAC, may be used to provide assurance of the authenticity and, hence, the integrity of binary data.},
language = {en},
number = {NIST SP 800-38B},
urldate = {2021-01-01},
institution = {National Institute of Standards and Technology},
author = {Dworkin, Morris J.},
month = may,
year = {2005},
doi = {10.6028/NIST.SP.800-38b},
note = {Edition: 0},
pages = {NIST SP 800--38b},
}
@misc{poovendran_aes-cmac_2006,
title = {The {AES}-{CMAC} {Algorithm}},
url = {https://tools.ietf.org/html/rfc4493},
language = {en},
urldate = {2021-01-01},
author = {Song, JunHyuk and Poovendran, Radha and Lee, Jicheol and Iwata, Tetsu},
month = jun,
year = {2006},
}
@misc{dalal_improving_2010,
title = {Improving {TCP}'s {Robustness} to {Blind} {In}-{Window} {Attacks}},
url = {https://tools.ietf.org/html/rfc5961},
language = {en},
urldate = {2021-01-01},
author = {Ramaiah, Anantha and Stewart, Randall R. and Dalal, Mitesh},
month = aug,
year = {2010},
}
@techreport{watson_slipping_2003,
type = {Technical {Whitepaper}},
title = {Slipping in the {Window}: {TCP} {Reset} {Attacks}},
language = {en},
author = {Watson, Paul A},
month = oct,
year = {2003},
pages = {33},
}
@inproceedings{wischik_design_2011,
address = {USA},
series = {{NSDI}'11},
title = {Design, implementation and evaluation of congestion control for multipath {TCP}},
abstract = {Multipath TCP, as proposed by the IETF working group mptcp, allows a single data stream to be split across multiple paths. This has obvious benefits for reliability, and it can also lead to more efficient use of networked resources. We describe the design of a multipath congestion control algorithm, we implement it in Linux, and we evaluate it for multihomed servers, data centers and mobile clients. We show that some 'obvious' solutions for multipath congestion control can be harmful, but that our algorithm improves throughput and fairness compared to single-path TCP. Our algorithm is a drop-in replacement for TCP, and we believe it is safe to deploy.},
urldate = {2021-01-01},
booktitle = {Proceedings of the 8th {USENIX} conference on {Networked} systems design and implementation},
publisher = {USENIX Association},
author = {Wischik, Damon and Raiciu, Costin and Greenhalgh, Adam and Handley, Mark},
month = mar,
year = {2011},
pages = {99--112},
}
@misc{tsou_ipsec_2012,
  title    = {{IPsec} {Anti}-{Replay} {Algorithm} without {Bit} {Shifting}},
  author   = {Tsou, Tina and Zhang, Xiangyang},
  month    = jan,
  year     = {2012},
  url      = {https://tools.ietf.org/html/rfc6479},
  urldate  = {2021-01-01},
  language = {en},
}
@misc{krawczyk_hmac_1997,
title = {{HMAC}: {Keyed}-{Hashing} for {Message} {Authentication}},
shorttitle = {{HMAC}},
url = {https://tools.ietf.org/html/rfc2104},
language = {en},
urldate = {2020-12-31},
author = {Krawczyk, Hugo and Bellare, Mihir and Canetti, Ran},
month = feb,
year = {1997},
}
@misc{handley_tcp_2020,
title = {{TCP} {Extensions} for {Multipath} {Operation} with {Multiple} {Addresses}},
url = {https://tools.ietf.org/html/rfc8684},
language = {en},
urldate = {2020-12-22},
author = {Ford, Alan and Raiciu, Costin and Handley, Mark and Bonaventure, Olivier and Paasch, Christoph},
month = mar,
year = {2020},
}
@book{anderson_security_2008,
address = {Indianapolis, IN},
edition = {Second},
title = {Security {Engineering}: {A} {Guide} to {Building} {Dependable} {Distributed} {Systems}},
isbn = {978-0-470-06852-6},
shorttitle = {Security engineering},
publisher = {Wiley Pub},
author = {Anderson, Ross},
year = {2008},
note = {OCLC: ocn192045774},
keywords = {Computer security, Distributed processing, Electronic data processing},
}
% Conference paper: packet-level behaviour of proposed multipath TCP
% congestion controllers ("flapping" between paths) and a resource-poolability
% metric (NetCOOP 2009, Springer LNCS). No data defects spotted; entry left
% byte-identical.
@inproceedings{wischik_control_2009,
address = {Berlin, Heidelberg},
series = {Lecture {Notes} in {Computer} {Science}},
title = {Control of {Multipath} {TCP} and {Optimization} of {Multipath} {Routing} in the {Internet}},
isbn = {978-3-642-10406-0},
doi = {10.1007/978-3-642-10406-0_14},
abstract = {There are moves in the Internet architecture community to add multipath capabilities to TCP, so that end-systems will be able to shift their traffic away from congested parts of the network. We study two problems relating to the design of multipath TCP. (i) We investigate stochastic packet-level behaviour of some proposed multipath congestion control algorithms, and find that they do not behave how we might expect from fluid modeling: they tend to flap randomly between their available paths. We explain why, and propose a congestion control algorithm that does not flap. (ii) We consider how the path choice offered by the network affects the ability of end-systems to shift their traffic between a pool of resources. We define a resource poolability metric, which measures for each resource how easy it is for traffic to be shifted away from that resource e.g. in the event of a traffic surge or link failure.},
language = {en},
booktitle = {Network {Control} and {Optimization}},
publisher = {Springer},
author = {Wischik, Damon and Handley, Mark and Raiciu, Costin},
editor = {Núñez-Queija, Rudesindo and Resing, Jacques},
year = {2009},
keywords = {congestion control, fluid model, load balancing, multipath TCP, resource pooling},
pages = {204--218},
}
@article{sharma_road_2020,
title = {The {Road} {Not} {Taken}: {Re}-thinking the {Feasibility} of {Voice} {Calling} {Over} {Tor}},
volume = {2020},
shorttitle = {The {Road} {Not} {Taken}},
url = {https://content.sciendo.com/view/journals/popets/2020/4/article-p69.xml},
doi = {10.2478/popets-2020-0063},
abstract = {Anonymous VoIP calls over the Internet holds great significance for privacy-conscious users, whistle-blowers and political activists alike. Prior research deems popular anonymization systems like Tor unsuitable for providing the requisite performance guarantees that real-time applications like VoIP need. Their claims are backed by studies that may no longer be valid due to constant advancements in Tor. Moreover, we believe that these studies lacked the requisite diversity and comprehensiveness. Thus, conclusions from these studies, led them to propose novel and tailored solutions. However, no such system is available for immediate use. Additionally, operating such new systems would incur significant costs for recruiting users and volunteered relays, to provide the necessary anonymity guarantees. It thus becomes an imperative that the exact performance of VoIP over Tor be quantified and analyzed, so that the potential performance bottlenecks can be amended. We thus conducted an extensive empirical study across various in-lab and real world scenarios to shed light on VoIP performance over Tor. In over half a million calls spanning 12 months, across seven countries and covering about 6650 Tor relays, we observed that Tor supports good voice quality (Perceptual Evaluation of Speech Quality (PESQ) {\textgreater} 3 and one-way delay {\textless} 400 ms) in more than 85\% of cases. Further analysis indicates that in general for most Tor relays, the contentions due to cross-traffic were low enough to support VoIP calls, that are anyways transmitted at low rates ({\textless} 120 Kbps). Our findings are supported by concordant measurements using iperf that show more than the adequate available bandwidth for most cases. Hence, unlike prior efforts, our research reveals that Tor is suitable for supporting anonymous VoIP calls.},
language = {en},
number = {4},
urldate = {2020-12-03},
journal = {Proceedings on Privacy Enhancing Technologies},
author = {Sharma, Piyush Kumar and Chaudhary, Shashwat and Hassija, Nikhil and Maity, Mukulika and Chakravarty, Sambuddho},
month = oct,
year = {2020},
note = {Publisher: Sciendo
Section: Proceedings on Privacy Enhancing Technologies},
pages = {69--88},
}
@incollection{hutchison_blake2_2013,
address = {Berlin, Heidelberg},
title = {{BLAKE2}: {Simpler}, {Smaller}, {Fast} as {MD5}},
volume = {7954},
isbn = {978-3-642-38979-5 978-3-642-38980-1},
shorttitle = {{BLAKE2}},
url = {http://link.springer.com/10.1007/978-3-642-38980-1_8},
abstract = {We present the hash function BLAKE2, an improved version of the SHA-3 finalist BLAKE optimized for speed in software. Target applications include cloud storage, intrusion detection, or version control systems. BLAKE2 comes in two main flavors: BLAKE2b is optimized for 64-bit platforms, and BLAKE2s for smaller architectures. On 64bit platforms, BLAKE2 is often faster than MD5, yet provides security similar to that of SHA-3: up to 256-bit collision resistance, immunity to length extension, indifferentiability from a random oracle, etc. We specify parallel versions BLAKE2bp and BLAKE2sp that are up to 4 and 8 times faster, by taking advantage of SIMD and/or multiple cores. BLAKE2 reduces the RAM requirements of BLAKE down to 168 bytes, making it smaller than any of the five SHA-3 finalists, and 32\% smaller than BLAKE. Finally, BLAKE2 provides a comprehensive support for tree-hashing as well as keyed hashing (be it in sequential or tree mode).},
language = {en},
urldate = {2020-11-28},
booktitle = {Applied {Cryptography} and {Network} {Security}},
publisher = {Springer Berlin Heidelberg},
author = {Aumasson, Jean-Philippe and Neves, Samuel and Wilcox-O'Hearn, Zooko and Winnerlein, Christian},
editor = {Jacobson, Michael and Locasto, Michael and Mohassel, Payman and Safavi-Naini, Reihaneh},
year = {2013},
doi = {10.1007/978-3-642-38980-1_8},
note = {Series Title: Lecture Notes in Computer Science},
pages = {119--135},
}
@article{peng_multipath_2016,
title = {Multipath {TCP}: {Analysis}, {Design}, and {Implementation}},
volume = {24},
issn = {1558-2566},
shorttitle = {Multipath {TCP}},
doi = {10.1109/TNET.2014.2379698},
abstract = {Multipath TCP (MP-TCP) has the potential to greatly improve application performance by using multiple paths transparently. We propose a fluid model for a large class of MP-TCP algorithms and identify design criteria that guarantee the existence, uniqueness, and stability of system equilibrium. We clarify how algorithm parameters impact TCP-friendliness, responsiveness, and window oscillation and demonstrate an inevitable tradeoff among these properties. We discuss the implications of these properties on the behavior of existing algorithms and motivate our algorithm Balia (balanced linked adaptation), which generalizes existing algorithms and strikes a good balance among TCP-friendliness, responsiveness, and window oscillation. We have implemented Balia in the Linux kernel. We use our prototype to compare the new algorithm to existing MP-TCP algorithms.},
number = {1},
journal = {IEEE/ACM Transactions on Networking},
author = {Peng, Qiuyu and Walid, Anwar and Hwang, Jaehyun and Low, Steven H.},
month = feb,
year = {2016},
note = {Conference Name: IEEE/ACM Transactions on Networking},
keywords = {Aggregates, Algorithm design and analysis, Asymptotic stability, Balia algorithm, Computer networks, Heuristic algorithms, Linux kernel, MP-TCP algorithms, Oscillators, Stability analysis, TCP/IP, Vectors, balanced linked adaptation, convergence, multipath TCP, nonlinear dynamical systems, transport protocols, window oscillation},
pages = {596--609},
}
@misc{ofcom_performance_2020,
title = {The performance of fixed-line broadband delivered to {UK} residential customers},
shorttitle = {{UK} {Home} {Broadband} {Performance}},
url = {https://www.ofcom.org.uk/research-and-data/telecoms-research/broadband-research/home-broadband-performance-2019},
abstract = {Our annual home broadband performance report compares how different broadband packages perform, using data from monitors installed on people's broadband routers.},
language = {en},
urldate = {2020-11-21},
journal = {Ofcom},
author = {{Ofcom}},
month = may,
year = {2020},
}
@inproceedings{hacker_effects_2002,
title = {The {Effects} of {Systemic} {Packet} {Loss} on {Aggregate} {TCP} {Flows}},
doi = {10.1109/SC.2002.10029},
abstract = {The use of parallel TCP connections to increase throughput for bulk transfers is common practice within the high performance computing community. However, the effectiveness, fairness, and efficiency of data transfers across parallel connections is unclear. This paper considers the impact of systemic non-congestion related packet loss on the effectiveness, fairness, and efficiency of parallel TCP transmissions. The results indicate that parallel connections are effective at increasing aggregate throughput, and increase the overall efficiency of the network bottleneck. In the presence of congestion related losses, parallel flows steal bandwidth from other single stream flows. A simple modification is presented that reduces the fairness problems when congestion is present, but retains effectiveness and efficiency.},
booktitle = {{SC} '02: {Proceedings} of the 2002 {ACM}/{IEEE} {Conference} on {Supercomputing}},
author = {Hacker, Thomas J. and Noble, Brian D. and Athey, Brian D.},
month = nov,
year = {2002},
note = {ISSN: 1063-9535},
keywords = {Aggregates, Bandwidth, Biology computing, Computer hacking, Concurrent computing, High performance computing, Internet, Loss measurement, Robustness, Throughput},
pages = {7--7},
}
% Conference paper: the original WireGuard protocol paper (NDSS 2017).
% NOTE(review): the abstract contains Unicode em-dashes and apostrophes
% ("keys—Curve25519", "DTLSs"); requires a UTF-8-capable toolchain
% (biblatex/Biber) -- confirm before compiling with classic BibTeX.
@inproceedings{donenfeld_wireguard_2017,
address = {San Diego, CA},
title = {{WireGuard}: {Next} {Generation} {Kernel} {Network} {Tunnel}},
isbn = {978-1-891562-46-4},
shorttitle = {{WireGuard}},
url = {https://www.ndss-symposium.org/ndss2017/ndss-2017-programme/wireguard-next-generation-kernel-network-tunnel/},
doi = {10.14722/ndss.2017.23160},
abstract = {WireGuard is a secure network tunnel, operating at layer 3, implemented as a kernel virtual network interface for Linux, which aims to replace both IPsec for most use cases, as well as popular user space and/or TLS-based solutions like OpenVPN, while being more secure, more performant, and easier to use. The virtual tunnel interface is based on a proposed fundamental principle of secure tunnels: an association between a peer public key and a tunnel source IP address. It uses a single round trip key exchange, based on NoiseIK, and handles all session creation transparently to the user using a novel timer state machine mechanism. Short pre-shared static keys—Curve25519 points—are used for mutual authentication in the style of OpenSSH. The protocol provides strong perfect forward secrecy in addition to a high degree of identity hiding. Transport speed is accomplished using ChaCha20Poly1305 authenticated-encryption for encapsulation of packets in UDP. An improved take on IP-binding cookies is used for mitigating denial of service attacks, improving greatly on IKEv2 and DTLSs cookie mechanisms to add encryption and authentication. The overall design allows for allocating no resources in response to received packets, and from a systems perspective, there are multiple interesting Linux implementation techniques for queues and parallelism. Finally, WireGuard can be simply implemented for Linux in less than 4,000 lines of code, making it easily audited and verified.},
language = {en},
urldate = {2020-11-19},
booktitle = {Proceedings 2017 {Network} and {Distributed} {System} {Security} {Symposium}},
publisher = {Internet Society},
author = {Donenfeld, Jason A.},
year = {2017},
}