smtp: inline addressparser code

2020-06-16 20:11:19 -07:00 · 2020-06-16 20:11:19 -07:00 · e575f1cb19
parent 28242e8d5e
commit e575f1cb19
5 changed files with 478 additions and 4 deletions
--- a/email.ts
+++ b/email.ts
@ -1,3 +1,4 @@
+export * from './smtp/address';
 export * from './smtp/client';
 export * from './smtp/connection';
 export * from './smtp/date';
--- a/smtp/address.ts
+++ b/smtp/address.ts
@ -0,0 +1,326 @@
+/*
+ * Operator tokens and which tokens are expected to end the sequence
+ */
+const OPERATORS = {
+	'"': '"',
+	'(': ')',
+	'<': '>',
+	',': '',
+	// Groups are ended by semicolons
+	':': ';',
+	// Semicolons are not a legal delimiter per the RFC2822 grammar other
+	// than for terminating a group, but they are also not valid for any
+	// other use in this context.  Given that some mail clients have
+	// historically allowed the semicolon as a delimiter equivalent to the
+	// comma in their UI, it makes sense to treat them the same as a comma
+	// when used outside of a group.
+	';': '',
+};
+
+interface TokenizerNode {
+	type: 'operator' | 'text';
+	value: string;
+}
+
+export interface AddressObject {
+	address?: string;
+	name?: string;
+	group?: AddressObject[];
+}
+
+/**
+ * Creates a Tokenizer object for tokenizing address field strings
+ *
+ * @constructor
+ * @param {String} str Address field string
+ */
+class Tokenizer {
+	private operatorExpecting = '';
+	private node?: TokenizerNode;
+	private escaped = false;
+	private list: TokenizerNode[] = [];
+	private str: string;
+
+	constructor(str: string | string[] = '') {
+		this.str = str.toString();
+	}
+
+	/**
+	 * Tokenizes the original input string
+	 *
+	 * @return {Array} An array of operator|text tokens
+	 */
+	public tokenize() {
+		let chr;
+		const list: TokenizerNode[] = [];
+
+		for (let i = 0, len = this.str.length; i < len; i++) {
+			chr = this.str.charAt(i);
+			this.checkChar(chr);
+		}
+
+		for (const node of this.list) {
+			node.value = (node.value || '').toString().trim();
+			if (node.value) {
+				list.push(node);
+			}
+		}
+
+		return list;
+	}
+
+	/**
+	 * Checks if a character is an operator or text and acts accordingly
+	 *
+	 * @param {string} chr Character from the address field
+	 * @returns {void}
+	 */
+	public checkChar(chr: string) {
+		if ((chr in OPERATORS || chr === '\\') && this.escaped) {
+			this.escaped = false;
+		} else if (this.operatorExpecting && chr === this.operatorExpecting) {
+			this.node = {
+				type: 'operator',
+				value: chr,
+			};
+			this.list.push(this.node);
+			this.node = undefined;
+			this.operatorExpecting = '';
+			this.escaped = false;
+			return;
+		} else if (!this.operatorExpecting && chr in OPERATORS) {
+			this.node = {
+				type: 'operator',
+				value: chr,
+			};
+			this.list.push(this.node);
+			this.node = undefined;
+			this.operatorExpecting = OPERATORS[chr as keyof typeof OPERATORS];
+			this.escaped = false;
+			return;
+		}
+
+		if (!this.escaped && chr === '\\') {
+			this.escaped = true;
+			return;
+		}
+
+		if (!this.node) {
+			this.node = {
+				type: 'text',
+				value: '',
+			};
+			this.list.push(this.node);
+		}
+
+		if (this.escaped && chr !== '\\') {
+			this.node.value += '\\';
+		}
+
+		this.node.value += chr;
+		this.escaped = false;
+	}
+}
+/**
+ * Converts tokens for a single address into an address object
+ *
+ * @param {TokenizerNode[]} tokens Tokens object
+ * @return {AddressObject[]} addresses object array
+ */
+function handleAddress(tokens: TokenizerNode[]) {
+	let isGroup = false;
+	let state = 'text';
+
+	let address: AddressObject;
+
+	let addresses: string[] = [];
+	let comments: string[] = [];
+	let texts: string[] = [];
+
+	const groups: string[] = [];
+	const addressObjects: AddressObject[] = [];
+
+	const data: {
+		address: string;
+		comment: string;
+		group: string;
+		text: string;
+	} = {
+		address: '',
+		comment: '',
+		group: '',
+		text: '',
+	};
+
+	// Filter out <addresses>, (comments) and regular text
+	for (let i = 0, len = tokens.length; i < len; i++) {
+		const token = tokens[i];
+
+		if (token.type === 'operator') {
+			switch (token.value) {
+				case '<':
+					state = 'address';
+					break;
+				case '(':
+					state = 'comment';
+					break;
+				case ':':
+					state = 'group';
+					isGroup = true;
+					break;
+				default:
+					state = 'text';
+					break;
+			}
+		} else {
+			if (token.value) {
+				switch (state) {
+					case 'address':
+						addresses.push(token.value);
+						break;
+					case 'comment':
+						comments.push(token.value);
+						break;
+					case 'group':
+						groups.push(token.value);
+						break;
+					default:
+						texts.push(token.value);
+						break;
+				}
+			}
+		}
+	}
+
+	// If there is no text but a comment, replace the two
+	if (texts.length === 0 && comments.length > 0) {
+		texts = [...comments];
+		comments = [];
+	}
+
+	if (isGroup) {
+		// http://tools.ietf.org/html/rfc2822#appendix-A.1.3
+		data.text = texts.join(' ');
+		addressObjects.push({
+			name: data.text || undefined,
+			group: groups.length > 0 ? addressparser(groups.join(',')) : [],
+		});
+	} else {
+		// If no address was found, try to detect one from regular text
+		if (addresses.length === 0 && texts.length > 0) {
+			for (let i = texts.length - 1; i >= 0; i--) {
+				if (texts[i].match(/^[^@\s]+@[^@\s]+$/)) {
+					addresses = texts.splice(i, 1);
+					break;
+				}
+			}
+
+			const _regexHandler = function (address: string) {
+				if (addresses.length === 0) {
+					addresses = [address.trim()];
+					return ' ';
+				} else {
+					return address;
+				}
+			};
+
+			// still no address
+			if (addresses.length === 0) {
+				for (let i = texts.length - 1; i >= 0; i--) {
+					texts[i] = texts[i]
+						.replace(/\s*\b[^@\s]+@[^@\s]+\b\s*/, _regexHandler)
+						.trim();
+					if (addresses.length) {
+						break;
+					}
+				}
+			}
+		}
+
+		// If there's still is no text but a comment exixts, replace the two
+		if (texts.length === 0 && comments.length > 0) {
+			texts = [...comments];
+			comments = [];
+		}
+
+		// Keep only the first address occurence, push others to regular text
+		if (addresses.length > 1) {
+			texts = texts.concat(addresses.splice(1));
+		}
+
+		// Join values with spaces
+		data.text = texts.join(' ');
+		data.address = addresses.join(' ');
+
+		if (!data.address && isGroup) {
+			return [];
+		} else {
+			address = {
+				address: data.address || data.text || '',
+				name: data.text || data.address || '',
+			};
+
+			if (address.address === address.name) {
+				if ((address.address || '').match(/@/)) {
+					address.name = '';
+				} else {
+					address.address = '';
+				}
+			}
+
+			addressObjects.push(address);
+		}
+	}
+
+	return addressObjects;
+}
+
+/**
+ * Parses structured e-mail addresses from an address field
+ *
+ * Example:
+ *
+ *    "Name <address@domain>"
+ *
+ * will be converted to
+ *
+ *     [{name: "Name", address: "address@domain"}]
+ *
+ * @param {string} str Address field
+ * @return {AddressObject[]} An array of address objects
+ */
+export function addressparser(str?: string | string[]) {
+	const tokenizer = new Tokenizer(str);
+	const tokens = tokenizer.tokenize();
+
+	const addresses: TokenizerNode[][] = [];
+	let address: TokenizerNode[] = [];
+	let parsedAddresses: AddressObject[] = [];
+
+	for (const token of tokens) {
+		if (
+			token.type === 'operator' &&
+			(token.value === ',' || token.value === ';')
+		) {
+			if (address.length) {
+				addresses.push(address);
+			}
+			address = [];
+		} else {
+			address.push(token);
+		}
+	}
+
+	if (address.length) {
+		addresses.push(address);
+	}
+
+	for (const address of addresses) {
+		const handled = handleAddress(address);
+		if (handled.length) {
+			parsedAddresses = parsedAddresses.concat(handled);
+		}
+	}
+
+	return parsedAddresses;
+}
--- a/smtp/client.ts
+++ b/smtp/client.ts
@ -1,4 +1,4 @@
-import addressparser from 'addressparser';
+import { addressparser } from './address';
 import { Message } from './message';
 import type { MessageAttachment, MessageHeaders } from './message';
 import { SMTPConnection, SMTPState } from './connection';
@ -120,7 +120,7 @@ export class SMTPClient {
 			const parsedReturnPath = addressparser(returnPath);
 			if (parsedReturnPath.length > 0) {
 				const [{ address: returnPathAddress }] = parsedReturnPath;
-				stack.returnPath = returnPathAddress;
+				stack.returnPath = returnPathAddress as string;
 			}
 		}

--- a/smtp/message.ts
+++ b/smtp/message.ts
@ -3,8 +3,7 @@ import type { PathLike, ReadStream } from 'fs';
 import { hostname } from 'os';
 import { Stream } from 'stream';

-import addressparser from 'addressparser';
-
+import { addressparser } from './address';
 import { getRFC2822Date } from './date';
 import { mimeWordEncode } from './mime';

--- a/test/address.ts
+++ b/test/address.ts
@ -0,0 +1,148 @@
+import test from 'ava';
+import { addressparser } from '../email';
+
+test('addressparser should handle single address correctly', async (t) => {
+	t.deepEqual(addressparser('andris@tr.ee'), [
+		{ address: 'andris@tr.ee', name: '' },
+	]);
+});
+
+test('addressparser should handle multiple addresses correctly', async (t) => {
+	t.deepEqual(addressparser('andris@tr.ee, andris@example.com'), [
+		{ address: 'andris@tr.ee', name: '' },
+		{ address: 'andris@example.com', name: '' },
+	]);
+});
+
+test('addressparser should handle unquoted name correctly', async (t) => {
+	t.deepEqual(addressparser('andris <andris@tr.ee>'), [
+		{ name: 'andris', address: 'andris@tr.ee' },
+	]);
+});
+
+test('addressparser should handle quoted name correctly', async (t) => {
+	t.deepEqual(addressparser('"reinman, andris" <andris@tr.ee>'), [
+		{ name: 'reinman, andris', address: 'andris@tr.ee' },
+	]);
+});
+
+test('addressparser should handle quoted semicolons correctly', async (t) => {
+	t.deepEqual(addressparser('"reinman; andris" <andris@tr.ee>'), [
+		{ name: 'reinman; andris', address: 'andris@tr.ee' },
+	]);
+});
+
+test('addressparser should handle unquoted name, unquoted address correctly', async (t) => {
+	t.deepEqual(addressparser('andris andris@tr.ee'), [
+		{ name: 'andris', address: 'andris@tr.ee' },
+	]);
+});
+
+test('addressparser should handle empty group correctly', async (t) => {
+	t.deepEqual(addressparser('Undisclosed:;'), [
+		{ name: 'Undisclosed', group: [] },
+	]);
+});
+
+test('addressparser should handle address group correctly', async (t) => {
+	t.deepEqual(addressparser('Disclosed:andris@tr.ee, andris@example.com;'), [
+		{
+			name: 'Disclosed',
+			group: [
+				{ address: 'andris@tr.ee', name: '' },
+				{ address: 'andris@example.com', name: '' },
+			],
+		},
+	]);
+});
+
+test('addressparser should handle semicolon as a delimiter', async (t) => {
+	t.deepEqual(addressparser('andris@tr.ee; andris@example.com;'), [
+		{ address: 'andris@tr.ee', name: '' },
+		{ address: 'andris@example.com', name: '' },
+	]);
+});
+
+test('addressparser should handle mixed group correctly', async (t) => {
+	t.deepEqual(
+		addressparser(
+			'Test User <test.user@mail.ee>, Disclosed:andris@tr.ee, andris@example.com;,,,, Undisclosed:;'
+		),
+		[
+			{ address: 'test.user@mail.ee', name: 'Test User' },
+			{
+				name: 'Disclosed',
+				group: [
+					{ address: 'andris@tr.ee', name: '' },
+					{ address: 'andris@example.com', name: '' },
+				],
+			},
+			{ name: 'Undisclosed', group: [] },
+		]
+	);
+});
+
+test('addressparser semicolon as delimiter should not break group parsing ', async (t) => {
+	t.deepEqual(
+		addressparser(
+			'Test User <test.user@mail.ee>; Disclosed:andris@tr.ee, andris@example.com;,,,, Undisclosed:; bob@example.com;'
+		),
+		[
+			{ address: 'test.user@mail.ee', name: 'Test User' },
+			{
+				name: 'Disclosed',
+				group: [
+					{
+						address: 'andris@tr.ee',
+						name: '',
+					},
+					{
+						address: 'andris@example.com',
+						name: '',
+					},
+				],
+			},
+			{ name: 'Undisclosed', group: [] },
+			{ address: 'bob@example.com', name: '' },
+		]
+	);
+});
+
+test('addressparser should handle name from comment correctly', async (t) => {
+	t.deepEqual(addressparser('andris@tr.ee (andris)'), [
+		{ name: 'andris', address: 'andris@tr.ee' },
+	]);
+});
+
+test('addressparser should handle skip comment correctly', async (t) => {
+	t.deepEqual(addressparser('andris@tr.ee (reinman) andris'), [
+		{ name: 'andris', address: 'andris@tr.ee' },
+	]);
+});
+
+test('addressparser should handle missing address correctly', async (t) => {
+	t.deepEqual(addressparser('andris'), [{ name: 'andris', address: '' }]);
+});
+
+test('addressparser should handle apostrophe in name correctly', async (t) => {
+	t.deepEqual(addressparser("O'Neill"), [{ name: "O'Neill", address: '' }]);
+});
+
+test('addressparser should handle particularly bad input, unescaped colon correctly', async (t) => {
+	t.deepEqual(
+		addressparser(
+			'FirstName Surname-WithADash :: Company <firstname@company.com>'
+		),
+		[
+			{
+				name: 'FirstName Surname-WithADash',
+				group: [
+					{
+						name: undefined,
+						group: [{ address: 'firstname@company.com', name: 'Company' }],
+					},
+				],
+			},
+		]
+	);
+});