% Pan et al., ICML 2023 (published by PMLR): the Machiavelli benchmark paper.
% Title words that must survive sentence-casing are individually braced;
% presentation (italics, casing) is left to the bibliography style.
@inproceedings{pan2023dorewards,
  author    = {Pan, Alexander and Chan, Jun Shern and Zou, Andy and Li, Nathaniel and Basart, Steven and Woodside, Thomas and Ng, Jonathan and Zhang, Hanlin and Emmons, Scott and Hendrycks, Dan},
  title     = {Do the Rewards Justify the Means? {Measuring} Trade-Offs Between Rewards and Ethical Behavior in the {Machiavelli} Benchmark},
  booktitle = {International Conference on Machine Learning},
  year      = {2023},
  pages     = {26837--26867},
  publisher = {PMLR},
}